First, load the required libraries.
# Remove every object that ls() lists in the current environment before the analysis starts.
# NOTE(review): this does not detach packages or reset options(), so it does not give a
# truly fresh session; consider dropping it and running the script in a new R session.
rm(list = ls())
library(vtable)
library(scales)
library(corrplot)
library(nortest)
library(randomForest)
library(foreign)
library(GGally)
library(haven)
library(magrittr)
library(data.table)
library(dplyr)
library(plyr)
library(nycflights13)
library(tidyverse)
library(datasets)
library(readxl)
library(maps)
library(plotly)
library(DT)
library(tidytext)
library(plyr)
library(factoextra)
library(readxl)
library(plotly)
library(naivebayes)
library(caTools)
library(devtools)
library(ggcorrplot)
library(usethis)
library(fastDummies)
library(recipes)
library(caretEnsemble)
library(readr)
library("gplots")
library(dominanceanalysis)
library(caTools)
library(randomForest)
library(xgboost)
library(data.table)
library(plyr)
library(nycflights13)
library(datasets)
library(readxl)
library(magrittr)
library(maps)
library(plotly)
library(plyr)
library(GGally)
library(readxl)
library(plotly)
library(graphics)
library(e1071)
library(caTools)
library(ggplot2)
library(caret)
library(caretEnsemble)
library(psych)
library(GGally)
library(rpart)
library(randomForest)
library(readr)
library(vtable)
library(scales)
library(gridExtra)
library(corrplot)
library(nortest)
library(class)
library(randomForest)
library(foreign)
library(foreign)
library(GGally)
library(data.table)
library(plyr)
library(ggmap)
library(nycflights13)
library(datasets)
library(readxl)
library(DataExplorer)
library(maps)
library(plotly)
library(plyr)
library(GGally)
library(readxl)
library(plotly)
library(mice)
library(caTools)
library(lattice)
library(ggcorrplot)
library(usethis)
library(fastDummies)
library(recipes)
library(GGally)
library(caretEnsemble)
library(Amelia)
library(GGally)
library(randomForest)
library(readr)
library(aod)
library("gplots")
library(caret)
library(dominanceanalysis)
library(caTools)
library(randomForest) # for fitting RFs
library(skimr)
library(GGally)
library(plotly)
library(viridis)
library(caret)
library(randomForest)
library(rpart.plot)
library(corrgram)
library(h2o)
library(ggthemes)
library(treemap)
library(treemapify)
library(repr)
library(cowplot)
library(magrittr)
library(ggpubr)
library(RColorBrewer)
library(plotrix)
library(ggrepel)
library(forcats)
library(reshape2)
library(caTools)
library(tree)
library(rattle)
library(foreign)
library(haven)
library(ggplot2)
library(foreign)
library(ggplot2)
library(GGally)
library(haven)
library(magrittr)
library(data.table)
library(dplyr)
library(plyr)
library(dplyr)
library(factoextra)
library(ggplot2)
library(ggmap)
library(nycflights13)
library(tidyverse)
library(datasets)
library(readxl)
library(tidyverse)
library(magrittr)
library(DataExplorer)
library(maps)
library(plotly)
library(DT)
library(tidytext)
library(plyr)
library(gridExtra)
library(factoextra)
library(GGally)
library(readxl)
library(tidyverse)
library(magrittr)
library(DataExplorer)
library(maps)
library(plotly)
library(DT)
library(tidytext)
library(gridExtra)
library(factoextra)
library(GGally)
library(gridExtra)
library(graphics)
library(mice)
library(naivebayes)
library(e1071)
library(caTools)
library(lattice)
library(ggplot2)
library(tidyverse)
library(caret)
library(caretEnsemble)
library(psych)
library(Amelia)
library(mice)
library(GGally)
library(rpart)
library(randomForest)
library(scales)
library(readr)
# Session-wide settings: the repr.plot.* plot-size options (8 x 6) and
# warn = -1, which makes R ignore all warnings for the rest of the script.
options(
  repr.plot.width = 8,
  repr.plot.height = 6,
  warn = -1
)
Let's import the data sets.
# Load the three case-study CSV files from hard-coded absolute paths.
# The main data set uses read.csv()'s default string handling; the two
# competition sets are read with stringsAsFactors = TRUE.
df <- read.csv(
  file = "/Users/owner/Desktop/homework/unit14,15(case sudy)/CaseStudy2-data.csv"
)
cs2.NoAttrition <- read.csv(
  file = "/Users/owner/Desktop/homework/unit14,15(case sudy)/Unit%2014%20and%2015%20Case%20Study%202 2/CaseStudy02/CaseStudy2CompSet No Attrition.csv",
  stringsAsFactors = TRUE
)
cs2.NoSalary <- read.csv(
  file = "/Users/owner/Desktop/homework/unit14,15(case sudy)/Unit%2014%20and%2015%20Case%20Study%202 2/CaseStudy02/CaseStudy2CompSet No Salary.csv",
  stringsAsFactors = TRUE
)
# Missing-data check: total number of NA cells in df
df %>% is.na() %>% sum()
## [1] 0
# Share of NA cells, as a percentage of all cells (rows x columns)
(df %>% is.na() %>% sum()) / prod(dim(df)) * 100
## [1] 0
# ================ Quick look at the data and data processing ================
# Number of employees per job role, most common role first.
# The dplyr verbs are namespace-qualified because plyr is attached after dplyr
# at the top of this script and masks summarise(), arrange() and desc().
df %>%
  dplyr::group_by(JobRole) %>%
  dplyr::summarise(n = dplyr::n()) %>%
  dplyr::arrange(dplyr::desc(n))
## # A tibble: 9 × 2
## JobRole n
## <chr> <int>
## 1 Sales Executive 200
## 2 Research Scientist 172
## 3 Laboratory Technician 153
## 4 Manufacturing Director 87
## 5 Healthcare Representative 76
## 6 Sales Representative 53
## 7 Manager 51
## 8 Research Director 51
## 9 Human Resources 27
# Human-readable label for each Education code (1-5), stored as character.
# Every code is mapped explicitly, so an unexpected code becomes NA instead of
# being silently labelled "Phd D." by a catch-all else branch.
df$Educational_Levels <- as.character(factor(
  df$Education,
  levels = 1:5,
  labels = c(
    "Without College D.",
    "College D.",
    "Bachelors D.",
    "Masters D.",
    "Phd D."
  )
))
# Summary-statistics table for every column of df
# (st() is vtable's shorthand for sumtable()).
vtable::sumtable(df)
| Variable | N | Mean | Std. Dev. | Min | Pctl. 25 | Pctl. 75 | Max |
|---|---|---|---|---|---|---|---|
| ID | 870 | 435.5 | 251.292 | 1 | 218.25 | 652.75 | 870 |
| Age | 870 | 36.829 | 8.926 | 18 | 30 | 43 | 60 |
| Attrition | 870 | ||||||
| … No | 730 | 83.9% | |||||
| … Yes | 140 | 16.1% | |||||
| BusinessTravel | 870 | ||||||
| … Non-Travel | 94 | 10.8% | |||||
| … Travel_Frequently | 158 | 18.2% | |||||
| … Travel_Rarely | 618 | 71% | |||||
| DailyRate | 870 | 815.228 | 401.116 | 103 | 472.5 | 1165.75 | 1499 |
| Department | 870 | ||||||
| … Human Resources | 35 | 4% | |||||
| … Research & Development | 562 | 64.6% | |||||
| … Sales | 273 | 31.4% | |||||
| DistanceFromHome | 870 | 9.339 | 8.137 | 1 | 2 | 14 | 29 |
| Education | 870 | 2.901 | 1.023 | 1 | 2 | 4 | 5 |
| EducationField | 870 | ||||||
| … Human Resources | 15 | 1.7% | |||||
| … Life Sciences | 358 | 41.1% | |||||
| … Marketing | 100 | 11.5% | |||||
| … Medical | 270 | 31% | |||||
| … Other | 52 | 6% | |||||
| … Technical Degree | 75 | 8.6% | |||||
| EmployeeCount | 870 | 1 | 0 | 1 | 1 | 1 | 1 |
| EmployeeNumber | 870 | 1029.832 | 604.789 | 1 | 477.25 | 1561.5 | 2064 |
| EnvironmentSatisfaction | 870 | 2.701 | 1.099 | 1 | 2 | 4 | 4 |
| Gender | 870 | ||||||
| … Female | 354 | 40.7% | |||||
| … Male | 516 | 59.3% | |||||
| HourlyRate | 870 | 65.614 | 20.127 | 30 | 48 | 83 | 100 |
| JobInvolvement | 870 | 2.723 | 0.704 | 1 | 2 | 3 | 4 |
| JobLevel | 870 | 2.039 | 1.09 | 1 | 1 | 3 | 5 |
| JobSatisfaction | 870 | 2.709 | 1.115 | 1 | 2 | 4 | 4 |
| MaritalStatus | 870 | ||||||
| … Divorced | 191 | 22% | |||||
| … Married | 410 | 47.1% | |||||
| … Single | 269 | 30.9% | |||||
| MonthlyIncome | 870 | 6390.264 | 4597.696 | 1081 | 2839.5 | 8182 | 19999 |
| MonthlyRate | 870 | 14325.621 | 7108.382 | 2094 | 8092 | 20456.25 | 26997 |
| NumCompaniesWorked | 870 | 2.728 | 2.52 | 0 | 1 | 4 | 9 |
| Over18 | 870 | ||||||
| … Y | 870 | 100% | |||||
| OverTime | 870 | ||||||
| … No | 618 | 71% | |||||
| … Yes | 252 | 29% | |||||
| PercentSalaryHike | 870 | 15.2 | 3.675 | 11 | 12 | 18 | 25 |
| PerformanceRating | 870 | 3.152 | 0.359 | 3 | 3 | 3 | 4 |
| RelationshipSatisfaction | 870 | 2.707 | 1.102 | 1 | 2 | 4 | 4 |
| StandardHours | 870 | 80 | 0 | 80 | 80 | 80 | 80 |
| StockOptionLevel | 870 | 0.784 | 0.858 | 0 | 0 | 1 | 3 |
| TotalWorkingYears | 870 | 11.053 | 7.514 | 0 | 6 | 15 | 40 |
| TrainingTimesLastYear | 870 | 2.832 | 1.273 | 0 | 2 | 3 | 6 |
| WorkLifeBalance | 870 | 2.782 | 0.712 | 1 | 2 | 3 | 4 |
| YearsAtCompany | 870 | 6.962 | 6.021 | 0 | 3 | 10 | 40 |
| YearsInCurrentRole | 870 | 4.205 | 3.639 | 0 | 2 | 7 | 18 |
| YearsSinceLastPromotion | 870 | 2.169 | 3.186 | 0 | 0 | 3 | 15 |
| YearsWithCurrManager | 870 | 4.14 | 3.574 | 0 | 2 | 7 | 17 |
| Educational_Levels | 870 | ||||||
| … Bachelors D. | 324 | 37.2% | |||||
| … College D. | 182 | 20.9% | |||||
| … Masters D. | 240 | 27.6% | |||||
| … Phd D. | 26 | 3% | |||||
| … Without College D. | 98 | 11.3% |
Define the types of the variables: convert the categorical columns to factors.
# Convert the categorical columns of df to factors in a single pass.
# Each column is listed once; as.factor() leaves an existing factor unchanged.
df <- df %>%
  dplyr::mutate(dplyr::across(
    dplyr::all_of(c(
      "WorkLifeBalance",
      "JobRole",
      "JobInvolvement",
      "JobSatisfaction",
      "JobLevel",
      "TrainingTimesLastYear",
      "PerformanceRating",
      "StockOptionLevel",
      "RelationshipSatisfaction",
      "Education",
      "EnvironmentSatisfaction",
      "BusinessTravel",
      "Attrition"
    )),
    as.factor
  ))
# Number of employees in each Attrition class
table(df$Attrition)
##
## No Yes
## 730 140
# Convert the same categorical columns of cs2.NoAttrition to factors.
# Each column is listed once; as.factor() leaves an existing factor unchanged.
cs2.NoAttrition <- cs2.NoAttrition %>%
  dplyr::mutate(dplyr::across(
    dplyr::all_of(c(
      "WorkLifeBalance",
      "JobRole",
      "JobInvolvement",
      "JobSatisfaction",
      "JobLevel",
      "TrainingTimesLastYear",
      "PerformanceRating",
      "StockOptionLevel",
      "RelationshipSatisfaction",
      "Education",
      "EnvironmentSatisfaction",
      "BusinessTravel"
    )),
    as.factor
  ))
# Convert the categorical columns of cs2.NoSalary, including Attrition, to factors.
# Each column is listed once; as.factor() leaves an existing factor unchanged.
cs2.NoSalary <- cs2.NoSalary %>%
  dplyr::mutate(dplyr::across(
    dplyr::all_of(c(
      "WorkLifeBalance",
      "JobRole",
      "JobInvolvement",
      "JobSatisfaction",
      "JobLevel",
      "TrainingTimesLastYear",
      "PerformanceRating",
      "StockOptionLevel",
      "RelationshipSatisfaction",
      "Education",
      "EnvironmentSatisfaction",
      "BusinessTravel",
      "Attrition"
    )),
    as.factor
  ))
# Split df by attrition status. which() drops rows where the comparison is NA,
# and drop = FALSE keeps the result a data frame.
Attrition.Yes <- df[which(df$Attrition == "Yes"), , drop = FALSE]
Attrition.No <- df[which(df$Attrition == "No"), , drop = FALSE]
# Count how many columns of df have each class and print the result.
# vapply() with class(col)[1] always yields exactly one class name per column,
# so a column carrying several classes cannot turn the result into a list
# (which is what sapply(df, class) would return) and break table().
(d1 <- as.data.frame(
  table(vapply(df, function(col) class(col)[1], character(1)))
))
## Var1 Freq
## 1 character 7
## 2 factor 13
## 3 integer 17
# Bar chart of the column-class counts stored in d1
ggplot(data = d1, mapping = aes(x = Var1, y = Freq)) +
  geom_col(colour = "blue", fill = "lightblue") +
  labs(
    x = "Type of Class",
    y = "Frequency",
    title = "Column type Frequency"
  ) +
  theme_bw()
# Summary-statistics table of df split by Attrition, with the groups stacked
# one below the other (st() is vtable's shorthand for sumtable()).
vtable::sumtable(df, group = "Attrition", group.long = TRUE)
| Variable | N | Mean | Std. Dev. | Min | Pctl. 25 | Pctl. 75 | Max |
|---|---|---|---|---|---|---|---|
| Attrition: No | |||||||
| ID | 730 | 430.301 | 251.324 | 1 | 213.25 | 645.25 | 870 |
| Age | 730 | 37.412 | 8.673 | 18 | 31 | 43 | 60 |
| BusinessTravel | 730 | ||||||
| … Non-Travel | 83 | 11.4% | |||||
| … Travel_Frequently | 123 | 16.8% | |||||
| … Travel_Rarely | 524 | 71.8% | |||||
| DailyRate | 730 | 821.16 | 401.414 | 111 | 483.75 | 1178.25 | 1499 |
| Department | 730 | ||||||
| … Human Resources | 29 | 4% | |||||
| … Research & Development | 487 | 66.7% | |||||
| … Sales | 214 | 29.3% | |||||
| DistanceFromHome | 730 | 9.029 | 7.983 | 1 | 2 | 13 | 29 |
| Education | 730 | ||||||
| … 1 | 80 | 11% | |||||
| … 2 | 150 | 20.5% | |||||
| … 3 | 269 | 36.8% | |||||
| … 4 | 208 | 28.5% | |||||
| … 5 | 23 | 3.2% | |||||
| EducationField | 730 | ||||||
| … Human Resources | 11 | 1.5% | |||||
| … Life Sciences | 305 | 41.8% | |||||
| … Marketing | 80 | 11% | |||||
| … Medical | 233 | 31.9% | |||||
| … Other | 43 | 5.9% | |||||
| … Technical Degree | 58 | 7.9% | |||||
| EmployeeCount | 730 | 1 | 0 | 1 | 1 | 1 | 1 |
| EmployeeNumber | 730 | 1035.866 | 606.517 | 11 | 476.25 | 1571.5 | 2064 |
| EnvironmentSatisfaction | 730 | ||||||
| … 1 | 130 | 17.8% | |||||
| … 2 | 154 | 21.1% | |||||
| … 3 | 223 | 30.5% | |||||
| … 4 | 223 | 30.5% | |||||
| Gender | 730 | ||||||
| … Female | 301 | 41.2% | |||||
| … Male | 429 | 58.8% | |||||
| HourlyRate | 730 | 65.292 | 20.203 | 30 | 48 | 82.75 | 100 |
| JobInvolvement | 730 | ||||||
| … 1 | 25 | 3.4% | |||||
| … 2 | 184 | 25.2% | |||||
| … 3 | 447 | 61.2% | |||||
| … 4 | 74 | 10.1% | |||||
| JobLevel | 730 | ||||||
| … 1 | 243 | 33.3% | |||||
| … 2 | 282 | 38.6% | |||||
| … 3 | 115 | 15.8% | |||||
| … 4 | 57 | 7.8% | |||||
| … 5 | 33 | 4.5% | |||||
| JobRole | 730 | ||||||
| … Healthcare Representative | 68 | 9.3% | |||||
| … Human Resources | 21 | 2.9% | |||||
| … Laboratory Technician | 123 | 16.8% | |||||
| … Manager | 47 | 6.4% | |||||
| … Manufacturing Director | 85 | 11.6% | |||||
| … Research Director | 50 | 6.8% | |||||
| … Research Scientist | 140 | 19.2% | |||||
| … Sales Executive | 167 | 22.9% | |||||
| … Sales Representative | 29 | 4% | |||||
| JobSatisfaction | 730 | ||||||
| … 1 | 141 | 19.3% | |||||
| … 2 | 135 | 18.5% | |||||
| … 3 | 211 | 28.9% | |||||
| … 4 | 243 | 33.3% | |||||
| MaritalStatus | 730 | ||||||
| … Divorced | 179 | 24.5% | |||||
| … Married | 352 | 48.2% | |||||
| … Single | 199 | 27.3% | |||||
| MonthlyIncome | 730 | 6702 | 4675.472 | 1129 | 3162 | 8736.5 | 19999 |
| MonthlyRate | 730 | 14460.123 | 7126.983 | 2094 | 8191.25 | 20644.75 | 26997 |
| NumCompaniesWorked | 730 | 2.66 | 2.466 | 0 | 1 | 4 | 9 |
| Over18 | 730 | ||||||
| … Y | 730 | 100% | |||||
| OverTime | 730 | ||||||
| … No | 558 | 76.4% | |||||
| … Yes | 172 | 23.6% | |||||
| PercentSalaryHike | 730 | 15.175 | 3.627 | 11 | 12 | 18 | 25 |
| PerformanceRating | 730 | ||||||
| … 3 | 621 | 85.1% | |||||
| … 4 | 109 | 14.9% | |||||
| RelationshipSatisfaction | 730 | ||||||
| … 1 | 139 | 19% | |||||
| … 2 | 144 | 19.7% | |||||
| … 3 | 225 | 30.8% | |||||
| … 4 | 222 | 30.4% | |||||
| StandardHours | 730 | 80 | 0 | 80 | 80 | 80 | 80 |
| StockOptionLevel | 730 | ||||||
| … 0 | 281 | 38.5% | |||||
| … 1 | 328 | 44.9% | |||||
| … 2 | 78 | 10.7% | |||||
| … 3 | 43 | 5.9% | |||||
| TotalWorkingYears | 730 | 11.603 | 7.459 | 0 | 6 | 15 | 37 |
| TrainingTimesLastYear | 730 | ||||||
| … 0 | 22 | 3% | |||||
| … 1 | 34 | 4.7% | |||||
| … 2 | 252 | 34.5% | |||||
| … 3 | 265 | 36.3% | |||||
| … 4 | 57 | 7.8% | |||||
| … 5 | 68 | 9.3% | |||||
| … 6 | 32 | 4.4% | |||||
| WorkLifeBalance | 730 | ||||||
| … 1 | 31 | 4.2% | |||||
| … 2 | 162 | 22.2% | |||||
| … 3 | 452 | 61.9% | |||||
| … 4 | 85 | 11.6% | |||||
| YearsAtCompany | 730 | 7.301 | 5.936 | 0 | 3 | 10 | 33 |
| YearsInCurrentRole | 730 | 4.453 | 3.645 | 0 | 2 | 7 | 18 |
| YearsSinceLastPromotion | 730 | 2.175 | 3.147 | 0 | 0 | 3 | 15 |
| YearsWithCurrManager | 730 | 4.37 | 3.591 | 0 | 2 | 7 | 17 |
| Educational_Levels | 730 | ||||||
| … Bachelors D. | 269 | 36.8% | |||||
| … College D. | 150 | 20.5% | |||||
| … Masters D. | 208 | 28.5% | |||||
| … Phd D. | 23 | 3.2% | |||||
| … Without College D. | 80 | 11% | |||||
| Attrition: Yes | |||||||
| ID | 140 | 462.607 | 250.266 | 28 | 259.25 | 687.5 | 863 |
| Age | 140 | 33.786 | 9.615 | 18 | 28 | 39 | 58 |
| BusinessTravel | 140 | ||||||
| … Non-Travel | 11 | 7.9% | |||||
| … Travel_Frequently | 35 | 25% | |||||
| … Travel_Rarely | 94 | 67.1% | |||||
| DailyRate | 140 | 784.293 | 399.564 | 103 | 428.75 | 1110.75 | 1496 |
| Department | 140 | ||||||
| … Human Resources | 6 | 4.3% | |||||
| … Research & Development | 75 | 53.6% | |||||
| … Sales | 59 | 42.1% | |||||
| DistanceFromHome | 140 | 10.957 | 8.749 | 1 | 3 | 19 | 29 |
| Education | 140 | ||||||
| … 1 | 18 | 12.9% | |||||
| … 2 | 32 | 22.9% | |||||
| … 3 | 55 | 39.3% | |||||
| … 4 | 32 | 22.9% | |||||
| … 5 | 3 | 2.1% | |||||
| EducationField | 140 | ||||||
| … Human Resources | 4 | 2.9% | |||||
| … Life Sciences | 53 | 37.9% | |||||
| … Marketing | 20 | 14.3% | |||||
| … Medical | 37 | 26.4% | |||||
| … Other | 9 | 6.4% | |||||
| … Technical Degree | 17 | 12.1% | |||||
| EmployeeCount | 140 | 1 | 0 | 1 | 1 | 1 | 1 |
| EmployeeNumber | 140 | 998.371 | 596.858 | 1 | 483.25 | 1508.5 | 2027 |
| EnvironmentSatisfaction | 140 | ||||||
| … 1 | 42 | 30% | |||||
| … 2 | 24 | 17.1% | |||||
| … 3 | 35 | 25% | |||||
| … 4 | 39 | 27.9% | |||||
| Gender | 140 | ||||||
| … Female | 53 | 37.9% | |||||
| … Male | 87 | 62.1% | |||||
| HourlyRate | 140 | 67.293 | 19.712 | 32 | 51 | 84 | 100 |
| JobInvolvement | 140 | ||||||
| … 1 | 22 | 15.7% | |||||
| … 2 | 44 | 31.4% | |||||
| … 3 | 67 | 47.9% | |||||
| … 4 | 7 | 5% | |||||
| JobLevel | 140 | ||||||
| … 1 | 86 | 61.4% | |||||
| … 2 | 30 | 21.4% | |||||
| … 3 | 17 | 12.1% | |||||
| … 4 | 3 | 2.1% | |||||
| … 5 | 4 | 2.9% | |||||
| JobRole | 140 | ||||||
| … Healthcare Representative | 8 | 5.7% | |||||
| … Human Resources | 6 | 4.3% | |||||
| … Laboratory Technician | 30 | 21.4% | |||||
| … Manager | 4 | 2.9% | |||||
| … Manufacturing Director | 2 | 1.4% | |||||
| … Research Director | 1 | 0.7% | |||||
| … Research Scientist | 32 | 22.9% | |||||
| … Sales Executive | 33 | 23.6% | |||||
| … Sales Representative | 24 | 17.1% | |||||
| JobSatisfaction | 140 | ||||||
| … 1 | 38 | 27.1% | |||||
| … 2 | 31 | 22.1% | |||||
| … 3 | 43 | 30.7% | |||||
| … 4 | 28 | 20% | |||||
| MaritalStatus | 140 | ||||||
| … Divorced | 12 | 8.6% | |||||
| … Married | 58 | 41.4% | |||||
| … Single | 70 | 50% | |||||
| MonthlyIncome | 140 | 4764.786 | 3786.389 | 1081 | 2341.5 | 5838.75 | 19859 |
| MonthlyRate | 140 | 13624.286 | 6993.816 | 2396 | 8054.25 | 19498 | 26959 |
| NumCompaniesWorked | 140 | 3.079 | 2.772 | 0 | 1 | 5 | 9 |
| Over18 | 140 | ||||||
| … Y | 140 | 100% | |||||
| OverTime | 140 | ||||||
| … No | 60 | 42.9% | |||||
| … Yes | 80 | 57.1% | |||||
| PercentSalaryHike | 140 | 15.329 | 3.928 | 11 | 12 | 18 | 25 |
| PerformanceRating | 140 | ||||||
| … 3 | 117 | 83.6% | |||||
| … 4 | 23 | 16.4% | |||||
| RelationshipSatisfaction | 140 | ||||||
| … 1 | 35 | 25% | |||||
| … 2 | 27 | 19.3% | |||||
| … 3 | 36 | 25.7% | |||||
| … 4 | 42 | 30% | |||||
| StandardHours | 140 | 80 | 0 | 80 | 80 | 80 | 80 |
| StockOptionLevel | 140 | ||||||
| … 0 | 98 | 70% | |||||
| … 1 | 27 | 19.3% | |||||
| … 2 | 3 | 2.1% | |||||
| … 3 | 12 | 8.6% | |||||
| TotalWorkingYears | 140 | 8.186 | 7.162 | 0 | 3 | 10 | 40 |
| TrainingTimesLastYear | 140 | ||||||
| … 0 | 8 | 5.7% | |||||
| … 1 | 5 | 3.6% | |||||
| … 2 | 57 | 40.7% | |||||
| … 3 | 43 | 30.7% | |||||
| … 4 | 16 | 11.4% | |||||
| … 5 | 7 | 5% | |||||
| … 6 | 4 | 2.9% | |||||
| WorkLifeBalance | 140 | ||||||
| … 1 | 17 | 12.1% | |||||
| … 2 | 30 | 21.4% | |||||
| … 3 | 80 | 57.1% | |||||
| … 4 | 13 | 9.3% | |||||
| YearsAtCompany | 140 | 5.193 | 6.171 | 0 | 1 | 8 | 40 |
| YearsInCurrentRole | 140 | 2.907 | 3.333 | 0 | 0 | 4 | 15 |
| YearsSinceLastPromotion | 140 | 2.136 | 3.395 | 0 | 0 | 2 | 15 |
| YearsWithCurrManager | 140 | 2.943 | 3.245 | 0 | 0 | 6 | 14 |
| Educational_Levels | 140 | ||||||
| … Bachelors D. | 55 | 39.3% | |||||
| … College D. | 32 | 22.9% | |||||
| … Masters D. | 32 | 22.9% | |||||
| … Phd D. | 3 | 2.1% | |||||
| … Without College D. | 18 | 12.9% |
# Pairs plot of Attrition against monthly income and the tenure-related variables.
# cols[1] colours the points and correlation text; cols[2] fills the diagonal bars.
cols <- c("#4c86ad", "#f5dfb3")
GGally::ggpairs(
  dplyr::select(
    df,
    Attrition, MonthlyIncome, YearsSinceLastPromotion, YearsWithCurrManager,
    YearsAtCompany, YearsInCurrentRole, TotalWorkingYears
  ),
  lower = list(
    continuous = GGally::wrap("points", col = cols[1], alpha = 0.6),
    combo = GGally::wrap("box", fill = "white", col = "black")
  ),
  upper = list(
    continuous = GGally::wrap("cor", col = cols[1]),
    combo = GGally::wrap("facetdensity", col = "black")
  ),
  diag = list(
    continuous = GGally::wrap("barDiag", fill = cols[2], col = "black", bins = 18),
    discrete = GGally::wrap("barDiag", fill = cols[2], col = "black")
  )
)
# Pairs plot of Attrition against the pay-related variables and distance from home.
# cols[1] colours the points and correlation text; cols[2] fills the diagonal bars.
cols <- c("#4c86ad", "#f5dfb3")
GGally::ggpairs(
  dplyr::select(
    df,
    Attrition, PercentSalaryHike, MonthlyIncome, HourlyRate, MonthlyRate,
    DistanceFromHome
  ),
  lower = list(
    continuous = GGally::wrap("points", col = cols[1], alpha = 0.6),
    combo = GGally::wrap("box", fill = "white", col = "black")
  ),
  upper = list(
    continuous = GGally::wrap("cor", col = cols[1]),
    combo = GGally::wrap("facetdensity", col = "black")
  ),
  diag = list(
    continuous = GGally::wrap("barDiag", fill = cols[2], col = "black", bins = 18),
    discrete = GGally::wrap("barDiag", fill = cols[2], col = "black")
  )
)
# EDA of bivariate data
# Monthly income by gender: one horizontal boxplot per gender
Income_by_Gender <- ggplot(
  df,
  aes(x = Gender, y = MonthlyIncome, color = Gender, fill = Gender)
) +
  geom_boxplot() +
  scale_fill_manual(values = c("#F5A9F2", "#5882FA")) +
  scale_color_manual(values = c("#FE2EF7", "#5858FA")) +
  coord_flip() +
  labs(title = "Are there any Gender Disparities in Income?")
Income_by_Gender
Plot the relationship between each categorical variable and Attrition:
# ------- More graphs to explore the data -------
# Dodged bar charts of counts per category, split by Attrition
# (one chart is split by JobRole instead).
# The computed count is referenced with after_stat(count), which replaces the
# deprecated ..count.. notation.
# NOTE(review): several charts below are exact repeats of earlier ones; they are
# kept so that the sequence of rendered plots is unchanged.
ggplot(df, aes(OverTime, after_stat(count))) + geom_bar(aes(fill = Attrition), position = "dodge")
ggplot(df, aes(JobInvolvement, after_stat(count))) + geom_bar(aes(fill = Attrition), position = "dodge")
ggplot(df, aes(JobSatisfaction, after_stat(count))) + geom_bar(aes(fill = Attrition), position = "dodge")
ggplot(df, aes(StockOptionLevel, after_stat(count))) + geom_bar(aes(fill = Attrition), position = "dodge")
ggplot(df, aes(NumCompaniesWorked, after_stat(count))) + geom_bar(aes(fill = Attrition), position = "dodge")
ggplot(df, aes(JobSatisfaction, after_stat(count))) + geom_bar(aes(fill = Attrition), position = "dodge")
ggplot(df, aes(JobSatisfaction, after_stat(count))) + geom_bar(aes(fill = JobRole), position = "dodge")
ggplot(df, aes(WorkLifeBalance, after_stat(count))) + geom_bar(aes(fill = Attrition), position = "dodge")
ggplot(df, aes(TrainingTimesLastYear, after_stat(count))) + geom_bar(aes(fill = Attrition), position = "dodge")
ggplot(df, aes(PerformanceRating, after_stat(count))) + geom_bar(aes(fill = Attrition), position = "dodge")
ggplot(df, aes(StockOptionLevel, after_stat(count))) + geom_bar(aes(fill = Attrition), position = "dodge")
ggplot(df, aes(RelationshipSatisfaction, after_stat(count))) + geom_bar(aes(fill = Attrition), position = "dodge")
ggplot(df, aes(EnvironmentSatisfaction, after_stat(count))) + geom_bar(aes(fill = Attrition), position = "dodge")
ggplot(df, aes(Education, after_stat(count))) + geom_bar(aes(fill = Attrition), position = "dodge")
ggplot(df, aes(BusinessTravel, after_stat(count))) + geom_bar(aes(fill = Attrition), position = "dodge")
ggplot(df, aes(JobSatisfaction, after_stat(count))) + geom_bar(aes(fill = Attrition), position = "dodge")
ggplot(df, aes(EnvironmentSatisfaction, after_stat(count))) + geom_bar(aes(fill = Attrition), position = "dodge")
ggplot(df, aes(NumCompaniesWorked, after_stat(count))) + geom_bar(aes(fill = Attrition), position = "dodge")
ggplot(df, aes(PerformanceRating, after_stat(count))) + geom_bar(aes(fill = Attrition), position = "dodge")
ggplot(df, aes(RelationshipSatisfaction, after_stat(count))) + geom_bar(aes(fill = Attrition), position = "dodge")
ggplot(df, aes(TrainingTimesLastYear, after_stat(count))) + geom_bar(aes(fill = Attrition), position = "dodge")
ggplot(df, aes(WorkLifeBalance, after_stat(count))) + geom_bar(aes(fill = Attrition), position = "dodge")
ggplot(df, aes(OverTime, after_stat(count))) + geom_bar(aes(fill = Attrition), position = "dodge")
ggplot(df, aes(JobInvolvement, after_stat(count))) + geom_bar(aes(fill = Attrition), position = "dodge")
ggplot(df, aes(JobSatisfaction, after_stat(count))) + geom_bar(aes(fill = Attrition), position = "dodge")
ggplot(df, aes(JobLevel, after_stat(count))) + geom_bar(aes(fill = Attrition), position = "dodge")
ggplot(df, aes(OverTime, after_stat(count))) + geom_bar(aes(fill = Attrition), position = "dodge")
# Distribution of job satisfaction by attrition status
ggplot(df, aes(JobSatisfaction, after_stat(count))) + geom_bar(aes(fill = Attrition), position = "dodge")
# ------------ Continuous variables ------------
# Boxplots of the numeric variables, grouped mostly by Attrition and in a few
# cases by JobRole or JobSatisfaction.
ggplot(df, aes(Attrition, PercentSalaryHike)) +
  geom_boxplot(fill = "green")
ggplot(df, aes(Attrition, MonthlyIncome)) +
  geom_boxplot(fill = "green")
ggplot(df, aes(JobRole, MonthlyIncome)) +
  geom_boxplot(fill = "green")
ggplot(df, aes(Attrition, HourlyRate)) +
  geom_boxplot(fill = "green")
ggplot(df, aes(JobSatisfaction, HourlyRate)) +
  geom_boxplot(fill = "green")
ggplot(df, aes(JobSatisfaction, MonthlyIncome)) +
  geom_boxplot(fill = "green")
ggplot(df, aes(Attrition, DistanceFromHome)) +
  geom_boxplot(fill = "green")
ggplot(df, aes(JobSatisfaction, DistanceFromHome)) +
  geom_boxplot(fill = "green")
ggplot(df, aes(Attrition, YearsSinceLastPromotion)) +
  geom_boxplot(fill = "green")
ggplot(df, aes(Attrition, YearsWithCurrManager)) +
  geom_boxplot(fill = "green")
ggplot(df, aes(Attrition, YearsAtCompany)) +
  geom_boxplot(fill = "green")
ggplot(df, aes(Attrition, YearsInCurrentRole)) +
  geom_boxplot(fill = "green")
ggplot(df, aes(Attrition, TotalWorkingYears)) +
  geom_boxplot(fill = "green")
# Scatter plots of pairs of numeric variables, coloured by Attrition, with one
# smoother per group (linear for the first plot, loess for the others).
# `se = FALSE` replaces the reassignable shorthand `F`, and the smoothing
# formula is spelled out so geom_smooth() does not have to guess it. The
# YearsSinceLastPromotion vs YearsInCurrentRole plot was duplicated verbatim
# and is now drawn once.
ggplot(df, aes(TotalWorkingYears, MonthlyIncome, color = Attrition)) +
  geom_point(shape = 4, size = 2) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE)
ggplot(df, aes(YearsInCurrentRole, MonthlyIncome, color = Attrition)) +
  geom_point(shape = 4, size = 2) +
  geom_smooth(method = "loess", formula = y ~ x, se = FALSE)
ggplot(df, aes(YearsAtCompany, MonthlyIncome, color = Attrition)) +
  geom_point(shape = 4, size = 2) +
  geom_smooth(method = "loess", formula = y ~ x, se = FALSE)
ggplot(df, aes(YearsSinceLastPromotion, YearsInCurrentRole, color = Attrition)) +
  geom_point(shape = 4, size = 2) +
  geom_smooth(method = "loess", formula = y ~ x, se = FALSE)
ggplot(df, aes(TotalWorkingYears, YearsSinceLastPromotion, color = Attrition)) +
  geom_point(shape = 4, size = 2) +
  geom_smooth(method = "loess", formula = y ~ x, se = FALSE)
# Evaluation of monthly income using a histogram
### 1. Monthly Income Variable
# Density-scaled histogram of MonthlyIncome with a kernel density curve on
# top. after_stat() replaces the deprecated stat() wrapper, and `bins` is set
# to the value ggplot2 would otherwise pick by default (with a message).
ggplot(df, aes(x = MonthlyIncome)) +
  geom_histogram(aes(y = after_stat(density)), bins = 30,
                 col = "blue", fill = "gold") +
  geom_density(col = "red", size = 1) +
  labs(x = "Monthly Income (Salary)", y = " ",
       title = "Histogram for Monthly Income") +
  theme_bw()
# Smallest and largest monthly income in the data.
range(df$MonthlyIncome)
## [1] 1081 19999
# Monthly Income variable is not normally distributed and variable is positively skewed.
# Bar chart of the share of employees in each Attrition group, with the
# percentage printed as a label. after_stat() replaces the deprecated stat()
# wrapper for referring to the computed `count`.
ggplot(df, aes(x = Attrition, y = prop.table(after_stat(count)),
               label = scales::percent(prop.table(after_stat(count))))) +
  geom_bar(col = "blue", fill = "lightblue") +
  geom_text(stat = "count", size = 5) +
  labs(y = "Frequency", title = "Barplot for Attrition") +
  theme_bw()
# About 16% of the employees in this data set belong to the Attrition = Yes group.
# Distribution of monthly income within each Attrition group.
ggplot(data = df, mapping = aes(x = Attrition, y = MonthlyIncome)) +
  theme_bw() +
  geom_boxplot(colour = "blue", fill = "gold") +
  xlab("Attrition") +
  ylab("Monthly Income (Salary)") +
  ggtitle("Boxplots of Monthly Income for Attrition groups")
# Judging by the medians, employees with lower salaries tend to leave their jobs.
# Monthly income against age, one panel per Attrition group, with a linear
# fit. `se = FALSE` replaces the reassignable shorthand `F`.
ggplot(df, aes(x = Age, y = MonthlyIncome)) +
  geom_point(col = "blue") +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, col = "red", size = 1) +
  labs(y = "Monthly Income (Salary)", x = "Age",
       title = "Scatter plot for Monthly Income vs Age for Attrition") +
  theme_bw() +
  facet_wrap(~ Attrition)
# Pearson correlation between age and monthly income over all employees.
cor(df$Age, df$MonthlyIncome)
## [1] 0.4842883
# There is a positive relationship between monthly income and age in both groups: as age increases, monthly income also increases.
# Share of Attrition within each business-travel group (bars scaled to 100%).
ggplot(df, aes(x = BusinessTravel, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Business Travel", y = "Attrition",
       title = "Business Travel with Attrition") +
  theme_bw()
# Monthly income by business-travel group (label typo "Montly" corrected).
ggplot(df, aes(x = BusinessTravel, y = MonthlyIncome)) +
  geom_boxplot(fill = "gold") +
  labs(x = "Business Travel", y = "Monthly Income",
       title = "Monthly Income for Business Travel groups") +
  theme_bw()
# Employees who travel frequently have the highest attrition rate, and the non-travel group has the lowest income.
# Monthly income against the hourly, daily and monthly rate, each faceted by
# Attrition with a linear fit; the three plots are stacked with grid.arrange().
# Uses `<-` for assignment and `se = FALSE` instead of the reassignable `F`.
p1 <- ggplot(df, aes(x = HourlyRate, y = MonthlyIncome)) +
  geom_point(col = "blue") +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, col = "red", size = 1) +
  labs(x = "Hourly Rate", y = "Monthly Income",
       title = "Monthly Income vs Hourly Rate") +
  theme_bw() +
  facet_wrap(~ Attrition)
p2 <- ggplot(df, aes(x = DailyRate, y = MonthlyIncome)) +
  geom_point(col = "blue") +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, col = "red", size = 1) +
  labs(x = "Daily Rate", y = "Monthly Income",
       title = "Monthly Income vs Daily Rate") +
  theme_bw() +
  facet_wrap(~ Attrition)
p3 <- ggplot(df, aes(x = MonthlyRate, y = MonthlyIncome)) +
  geom_point(col = "blue") +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, col = "red", size = 1) +
  labs(x = "Monthly Rate", y = "Monthly Income",
       title = "Monthly Income vs Monthly Rate") +
  theme_bw() +
  facet_wrap(~ Attrition)
grid.arrange(p1, p2, p3)
# Pairwise correlations (cor() defaults, rounded to 2 decimals) among the
# rate, income, distance and tenure columns of Attrition.Yes.
round(
  cor(Attrition.Yes[c(
    "HourlyRate", "DailyRate", "MonthlyRate", "MonthlyIncome",
    "DistanceFromHome", "TotalWorkingYears", "YearsInCurrentRole",
    "YearsAtCompany", "YearsWithCurrManager", "YearsSinceLastPromotion"
  )]),
  digits = 2
)
## HourlyRate DailyRate MonthlyRate MonthlyIncome
## HourlyRate 1.00 0.19 -0.01 0.07
## DailyRate 0.19 1.00 -0.03 0.13
## MonthlyRate -0.01 -0.03 1.00 0.13
## MonthlyIncome 0.07 0.13 0.13 1.00
## DistanceFromHome 0.03 -0.08 -0.05 -0.01
## TotalWorkingYears 0.08 0.09 -0.02 0.74
## YearsInCurrentRole -0.04 -0.05 -0.05 0.58
## YearsAtCompany 0.01 -0.06 -0.06 0.65
## YearsWithCurrManager -0.03 0.01 -0.10 0.50
## YearsSinceLastPromotion -0.04 -0.07 -0.04 0.47
## DistanceFromHome TotalWorkingYears YearsInCurrentRole
## HourlyRate 0.03 0.08 -0.04
## DailyRate -0.08 0.09 -0.05
## MonthlyRate -0.05 -0.02 -0.05
## MonthlyIncome -0.01 0.74 0.58
## DistanceFromHome 1.00 0.01 -0.01
## TotalWorkingYears 0.01 1.00 0.64
## YearsInCurrentRole -0.01 0.64 1.00
## YearsAtCompany 0.01 0.78 0.82
## YearsWithCurrManager -0.01 0.62 0.81
## YearsSinceLastPromotion 0.04 0.59 0.67
## YearsAtCompany YearsWithCurrManager
## HourlyRate 0.01 -0.03
## DailyRate -0.06 0.01
## MonthlyRate -0.06 -0.10
## MonthlyIncome 0.65 0.50
## DistanceFromHome 0.01 -0.01
## TotalWorkingYears 0.78 0.62
## YearsInCurrentRole 0.82 0.81
## YearsAtCompany 1.00 0.77
## YearsWithCurrManager 0.77 1.00
## YearsSinceLastPromotion 0.75 0.69
## YearsSinceLastPromotion
## HourlyRate -0.04
## DailyRate -0.07
## MonthlyRate -0.04
## MonthlyIncome 0.47
## DistanceFromHome 0.04
## TotalWorkingYears 0.59
## YearsInCurrentRole 0.67
## YearsAtCompany 0.75
## YearsWithCurrManager 0.69
## YearsSinceLastPromotion 1.00
# Pairwise correlations (cor() defaults, rounded to 2 decimals) among the
# rate, income, distance and tenure columns of Attrition.No.
round(
  cor(Attrition.No[c(
    "HourlyRate", "DailyRate", "MonthlyRate", "MonthlyIncome",
    "DistanceFromHome", "TotalWorkingYears", "YearsInCurrentRole",
    "YearsAtCompany", "YearsWithCurrManager", "YearsSinceLastPromotion"
  )]),
  digits = 2
)
## HourlyRate DailyRate MonthlyRate MonthlyIncome
## HourlyRate 1.00 0.02 -0.02 0.00
## DailyRate 0.02 1.00 -0.03 -0.03
## MonthlyRate -0.02 -0.03 1.00 0.05
## MonthlyIncome 0.00 -0.03 0.05 1.00
## DistanceFromHome 0.07 0.04 0.01 0.01
## TotalWorkingYears 0.03 -0.03 0.06 0.78
## YearsInCurrentRole 0.01 0.00 0.03 0.31
## YearsAtCompany 0.00 -0.04 -0.02 0.46
## YearsWithCurrManager 0.01 -0.04 -0.02 0.29
## YearsSinceLastPromotion 0.02 -0.06 0.02 0.30
## DistanceFromHome TotalWorkingYears YearsInCurrentRole
## HourlyRate 0.07 0.03 0.01
## DailyRate 0.04 -0.03 0.00
## MonthlyRate 0.01 0.06 0.03
## MonthlyIncome 0.01 0.78 0.31
## DistanceFromHome 1.00 0.02 0.01
## TotalWorkingYears 0.02 1.00 0.45
## YearsInCurrentRole 0.01 0.45 1.00
## YearsAtCompany -0.01 0.60 0.76
## YearsWithCurrManager 0.00 0.42 0.69
## YearsSinceLastPromotion -0.03 0.43 0.54
## YearsAtCompany YearsWithCurrManager
## HourlyRate 0.00 0.01
## DailyRate -0.04 -0.04
## MonthlyRate -0.02 -0.02
## MonthlyIncome 0.46 0.29
## DistanceFromHome -0.01 0.00
## TotalWorkingYears 0.60 0.42
## YearsInCurrentRole 0.76 0.69
## YearsAtCompany 1.00 0.76
## YearsWithCurrManager 0.76 1.00
## YearsSinceLastPromotion 0.63 0.48
## YearsSinceLastPromotion
## HourlyRate 0.02
## DailyRate -0.06
## MonthlyRate 0.02
## MonthlyIncome 0.30
## DistanceFromHome -0.03
## TotalWorkingYears 0.43
## YearsInCurrentRole 0.54
## YearsAtCompany 0.63
## YearsWithCurrManager 0.48
## YearsSinceLastPromotion 1.00
# There is no relationship between following variables for both Attrition groups.
# 1. Hourly Rate and Monthly Income
# 2. Daily Rate and Monthly Income
# 3. Monthly Rate and Monthly Income
# Share of Attrition within each department (bars scaled to 100%).
ggplot(df, aes(x = Department, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Department", y = "Attrition",
       title = "Department with Attrition") +
  theme_bw()
# Monthly income by department (label typo "Montly" corrected).
ggplot(df, aes(x = Department, y = MonthlyIncome)) +
  geom_boxplot(fill = "gold") +
  labs(x = "Department", y = "Monthly Income",
       title = "Monthly Income for Departments") +
  theme_bw()
# The Sales department has a higher attrition rate, and the HR department has the lowest median monthly income.
# Monthly income against distance from home, faceted by Attrition, with a
# linear fit (`se = FALSE` instead of the reassignable `F`).
ggplot(df, aes(x = DistanceFromHome, y = MonthlyIncome)) +
  geom_point(col = "blue") +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, col = "red", size = 1) +
  labs(x = "Distance From Home", y = "Monthly Income",
       title = "Monthly Income vs Distance From Home") +
  theme_bw() +
  facet_wrap(~ Attrition)
# Correlation within each attrition group.
cor(Attrition.Yes$DistanceFromHome, Attrition.Yes$MonthlyIncome)
## [1] -0.01230388
cor(Attrition.No$DistanceFromHome, Attrition.No$MonthlyIncome)
## [1] 0.0102446
# Treat Education as categorical so each level gets its own bar / box.
df$Education <- as.factor(df$Education)
ggplot(df, aes(x = Education, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Education", y = "Attrition",
       title = "Education with Attrition") +
  theme_bw()
# Monthly income by education level (label typo "Montly" corrected).
ggplot(df, aes(x = Education, y = MonthlyIncome)) +
  geom_boxplot(fill = "gold") +
  labs(x = "Education", y = "Monthly Income",
       title = "Monthly Income for Education") +
  theme_bw()
# Education level 1 has the highest attrition rate.
# Education level 5 has the highest median income.
# Share of Attrition within each education field. The labels previously read
# "Education Filed" / "Montly Income"; both typos are corrected.
ggplot(df, aes(x = EducationField, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Education Field", y = "Attrition",
       title = "Education Field with Attrition") +
  theme_bw()
# Monthly income by education field.
ggplot(df, aes(x = EducationField, y = MonthlyIncome)) +
  geom_boxplot(fill = "gold") +
  labs(x = "Education Field", y = "Monthly Income",
       title = "Monthly Income for Education Field") +
  theme_bw()
# The Human Resources education field has the highest attrition rate.
# The Marketing group has the highest median income.
# Treat EnvironmentSatisfaction as categorical so each level gets its own bar / box.
df$EnvironmentSatisfaction <- as.factor(df$EnvironmentSatisfaction)
ggplot(df, aes(x = EnvironmentSatisfaction, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Environment Satisfaction", y = "Attrition",
       title = "Environment Satisfaction with Attrition") +
  theme_bw()
# Monthly income by environment-satisfaction level (label typo "Montly" corrected).
ggplot(df, aes(x = EnvironmentSatisfaction, y = MonthlyIncome)) +
  geom_boxplot(fill = "gold") +
  labs(x = "Environment Satisfaction", y = "Monthly Income",
       title = "Monthly Income for Environment Satisfaction") +
  theme_bw()
# Employees who are less satisfied with their environment have higher attrition rates.
# Median incomes are very similar across all satisfaction levels.
# Share of Attrition within each gender (bars scaled to 100%).
ggplot(df, aes(x = Gender, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Gender", y = "Attrition",
       title = "Gender with Attrition") +
  theme_bw()
# Monthly income by gender (label typo "Montly" corrected).
ggplot(df, aes(x = Gender, y = MonthlyIncome)) +
  geom_boxplot(fill = "gold") +
  labs(x = "Gender", y = "Monthly Income",
       title = "Monthly Income for Gender") +
  theme_bw()
# Male and female employees have similar attrition rates.
# Treat JobInvolvement as categorical so each level gets its own bar / box.
df$JobInvolvement <- as.factor(df$JobInvolvement)
ggplot(df, aes(x = JobInvolvement, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Job Involvement", y = "Attrition",
       title = "Job Involvement with Attrition") +
  theme_bw()
# Monthly income by job-involvement level (label typo "Montly" corrected).
ggplot(df, aes(x = JobInvolvement, y = MonthlyIncome)) +
  geom_boxplot(fill = "gold") +
  labs(x = "Job Involvement", y = "Monthly Income",
       title = "Monthly Income for Job Involvement") +
  theme_bw()
# Employees with lower job involvement have a much higher attrition rate.
# Treat JobLevel as categorical so each level gets its own bar / box.
df$JobLevel <- as.factor(df$JobLevel)
ggplot(df, aes(x = JobLevel, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Job Level", y = "Attrition",
       title = "Job Level with Attrition") +
  theme_bw()
# Monthly income by job level (label typo "Montly" corrected).
ggplot(df, aes(x = JobLevel, y = MonthlyIncome)) +
  geom_boxplot(fill = "gold") +
  labs(x = "Job Level", y = "Monthly Income",
       title = "Monthly Income for Job Level") +
  theme_bw()
# Employees in job level 1 have the highest attrition rate.
# As job level increases, monthly income also increases.
# Share of Attrition within each job role; x-axis labels are rotated 45
# degrees so the role names stay readable.
ggplot(df, aes(x = JobRole, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Job Role", y = "Attrition",
       title = "Job Role with Attrition") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Monthly income by job role (label typo "Montly" corrected).
ggplot(df, aes(x = JobRole, y = MonthlyIncome)) +
  geom_boxplot(fill = "gold") +
  labs(x = "Job Role", y = "Monthly Income",
       title = "Monthly Income for Job Role") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Sales representatives have a higher attrition rate and the lowest income.
# Managers and Research Directors have the highest monthly income and a lower attrition rate.
# Treat JobSatisfaction as categorical so each level gets its own bar / box.
df$JobSatisfaction <- as.factor(df$JobSatisfaction)
ggplot(df, aes(x = JobSatisfaction, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Job Satisfaction", y = "Attrition",
       title = "Job Satisfaction with Attrition") +
  theme_bw()
# Monthly income by job-satisfaction level (label typo "Montly" corrected).
ggplot(df, aes(x = JobSatisfaction, y = MonthlyIncome)) +
  geom_boxplot(fill = "gold") +
  labs(x = "Job Satisfaction", y = "Monthly Income",
       title = "Monthly Income for Job Satisfaction") +
  theme_bw()
# Employees who are less satisfied with their jobs have higher attrition rates.
# Share of Attrition within each marital status (bars scaled to 100%).
ggplot(df, aes(x = MaritalStatus, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Marital Status", y = "Attrition",
       title = "Marital Status with Attrition") +
  theme_bw()
# Monthly income by marital status (label typo "Montly" corrected).
ggplot(df, aes(x = MaritalStatus, y = MonthlyIncome)) +
  geom_boxplot(fill = "gold") +
  labs(x = "Marital Status", y = "Monthly Income",
       title = "Monthly Income for Marital Status") +
  theme_bw()
# Single employees have the highest attrition rate.
# Share of Attrition among employees with and without overtime.
ggplot(df, aes(x = OverTime, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Over Time", y = "Attrition",
       title = "Over Time with Attrition") +
  theme_bw()
# Monthly income by overtime status (label typo "Montly" corrected).
ggplot(df, aes(x = OverTime, y = MonthlyIncome)) +
  geom_boxplot(fill = "gold") +
  labs(x = "Over Time", y = "Monthly Income",
       title = "Monthly Income for Over Time") +
  theme_bw()
# Employees who work overtime have a higher attrition rate and lower monthly income.
# Monthly income against number of companies worked, faceted by Attrition,
# with a linear fit (`se = FALSE` instead of the reassignable `F`).
ggplot(df, aes(x = NumCompaniesWorked, y = MonthlyIncome)) +
  geom_point(col = "blue") +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, col = "red", size = 1) +
  labs(x = "Number of Companies Worked", y = "Monthly Income",
       title = "Monthly Income vs Number of Companies Worked") +
  theme_bw() +
  facet_wrap(~ Attrition)
# Correlation within each attrition group.
cor(Attrition.Yes$NumCompaniesWorked, Attrition.Yes$MonthlyIncome)
## [1] 0.1007262
cor(Attrition.No$NumCompaniesWorked, Attrition.No$MonthlyIncome)
## [1] 0.180646
# There is only a weak relationship between monthly income and the number of companies worked.
# Monthly income against percent salary hike, faceted by Attrition, with a
# linear fit (`se = FALSE` instead of the reassignable `F`).
ggplot(df, aes(x = PercentSalaryHike, y = MonthlyIncome)) +
  geom_point(col = "blue") +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, col = "red", size = 1) +
  labs(x = "Percent Salary Hike", y = "Monthly Income",
       title = "Monthly Income vs Percent Salary Hike") +
  theme_bw() +
  facet_wrap(~ Attrition)
# Correlation within each attrition group.
cor(Attrition.Yes$PercentSalaryHike, Attrition.Yes$MonthlyIncome)
## [1] -0.1406169
cor(Attrition.No$PercentSalaryHike, Attrition.No$MonthlyIncome)
## [1] -0.03764575
# As Percent Salary Hike increases, monthly income tends to decrease slightly (weak negative correlation in both groups).
# Treat RelationshipSatisfaction as categorical so each level gets its own bar / box.
df$RelationshipSatisfaction <- as.factor(df$RelationshipSatisfaction)
ggplot(df, aes(x = RelationshipSatisfaction, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Relationship Satisfaction", y = "Attrition",
       title = "Relationship Satisfaction with Attrition") +
  theme_bw()
# Monthly income by relationship-satisfaction level (label typo "Montly" corrected).
ggplot(df, aes(x = RelationshipSatisfaction, y = MonthlyIncome)) +
  geom_boxplot(fill = "gold") +
  labs(x = "Relationship Satisfaction", y = "Monthly Income",
       title = "Monthly Income for Relationship Satisfaction") +
  theme_bw()
# Employees with low relationship satisfaction have a higher attrition rate, but median incomes are similar across all levels.
# Treat StockOptionLevel as categorical so each level gets its own bar / box.
df$StockOptionLevel <- as.factor(df$StockOptionLevel)
ggplot(df, aes(x = StockOptionLevel, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Stock Option Level", y = "Attrition",
       title = "Stock Option Level with Attrition") +
  theme_bw()
# Monthly income by stock-option level (label typo "Montly" corrected).
ggplot(df, aes(x = StockOptionLevel, y = MonthlyIncome)) +
  geom_boxplot(fill = "gold") +
  labs(x = "Stock Option Level", y = "Monthly Income",
       title = "Monthly Income for Stock Option Level") +
  theme_bw()
# Stock option levels 0 and 3 have higher attrition rates and lower median incomes.
# Monthly income against total working years, faceted by Attrition, with a
# linear fit (`se = FALSE` instead of the reassignable `F`).
ggplot(df, aes(x = TotalWorkingYears, y = MonthlyIncome)) +
  geom_point(col = "blue") +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, col = "red", size = 1) +
  labs(x = "Total Working Years", y = "Monthly Income",
       title = "Monthly Income vs Total Working Years") +
  theme_bw() +
  facet_wrap(~ Attrition)
# Correlation within each attrition group.
cor(Attrition.Yes$TotalWorkingYears, Attrition.Yes$MonthlyIncome)
## [1] 0.7360898
cor(Attrition.No$TotalWorkingYears, Attrition.No$MonthlyIncome)
## [1] 0.7795562
# There is a strong positive relationship between Total Working Years and Monthly Income in both attrition groups.
# Monthly income against training times last year, faceted by Attrition, with
# a linear fit (`se = FALSE` instead of the reassignable `F`).
ggplot(df, aes(x = TrainingTimesLastYear, y = MonthlyIncome)) +
  geom_point(col = "blue") +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, col = "red", size = 1) +
  labs(x = "Training Times Last Year", y = "Monthly Income",
       title = "Monthly Income vs Training Times Last Year") +
  theme_bw() +
  facet_wrap(~ Attrition)
# Treat WorkLifeBalance as categorical so each level gets its own bar / box.
df$WorkLifeBalance <- as.factor(df$WorkLifeBalance)
ggplot(df, aes(x = WorkLifeBalance, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Work Life Balance", y = "Attrition",
       title = "Work Life Balance with Attrition") +
  theme_bw()
# Monthly income by work-life-balance level (label typo "Montly" corrected).
ggplot(df, aes(x = WorkLifeBalance, y = MonthlyIncome)) +
  geom_boxplot(fill = "gold") +
  labs(x = "Work Life Balance", y = "Monthly Income",
       title = "Monthly Income for Work Life Balance") +
  theme_bw()
# Employees with poor work-life balance have a higher attrition rate and the lowest median income.
# Monthly income against years at the company, faceted by Attrition, with a
# linear fit (`se = FALSE` instead of the reassignable `F`).
ggplot(df, aes(x = YearsAtCompany, y = MonthlyIncome)) +
  geom_point(col = "blue") +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, col = "red", size = 1) +
  labs(x = "Years At Company", y = "Monthly Income",
       title = "Monthly Income vs Years At Company") +
  theme_bw() +
  facet_wrap(~ Attrition)
# Correlation within each attrition group.
cor(Attrition.Yes$YearsAtCompany, Attrition.Yes$MonthlyIncome)
## [1] 0.6450931
cor(Attrition.No$YearsAtCompany, Attrition.No$MonthlyIncome)
## [1] 0.456972
# There is a moderate positive relationship between Years At Company and Monthly Income in both attrition groups.
# Monthly income against years in current role, faceted by Attrition, with a
# linear fit (`se = FALSE` instead of the reassignable `F`).
ggplot(df, aes(x = YearsInCurrentRole, y = MonthlyIncome)) +
  geom_point(col = "blue") +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, col = "red", size = 1) +
  labs(x = "Years In Current Role", y = "Monthly Income",
       title = "Monthly Income vs Years In Current Role") +
  theme_bw() +
  facet_wrap(~ Attrition)
# Correlation within each attrition group.
cor(Attrition.Yes$YearsInCurrentRole, Attrition.Yes$MonthlyIncome)
## [1] 0.5768375
cor(Attrition.No$YearsInCurrentRole, Attrition.No$MonthlyIncome)
## [1] 0.3137409
# There is a positive relationship between Years In Current Role and Monthly Income in both attrition groups.
# Employees who have worked more than 15 years in their current role do not leave the company.
# Monthly income against years since last promotion, faceted by Attrition,
# with a linear fit (`se = FALSE` instead of the reassignable `F`).
ggplot(df, aes(x = YearsSinceLastPromotion, y = MonthlyIncome)) +
  geom_point(col = "blue") +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, col = "red", size = 1) +
  labs(x = "Years Since Last Promotion", y = "Monthly Income",
       title = "Monthly Income vs Years Since Last Promotion") +
  theme_bw() +
  facet_wrap(~ Attrition)
# Correlation within each attrition group.
cor(Attrition.Yes$YearsSinceLastPromotion, Attrition.Yes$MonthlyIncome)
## [1] 0.4735753
cor(Attrition.No$YearsSinceLastPromotion, Attrition.No$MonthlyIncome)
## [1] 0.2951971
# There is a positive relationship between Years Since Last Promotion and Monthly Income in both attrition groups.
# Monthly income against years with the current manager, faceted by
# Attrition, with a linear fit (`se = FALSE` instead of the reassignable `F`).
ggplot(df, aes(x = YearsWithCurrManager, y = MonthlyIncome)) +
  geom_point(col = "blue") +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, col = "red", size = 1) +
  labs(x = "Years With Current Manager", y = "Monthly Income",
       title = "Monthly Income vs Years With Current Manager") +
  theme_bw() +
  facet_wrap(~ Attrition)
# Correlation within each attrition group.
cor(Attrition.Yes$YearsWithCurrManager, Attrition.Yes$MonthlyIncome)
## [1] 0.4959712
cor(Attrition.No$YearsWithCurrManager, Attrition.No$MonthlyIncome)
## [1] 0.2875556
# ---- Evaluate the normality of the numeric variables ----
# Shapiro-Wilk test on each numeric variable (null hypothesis: the values are
# drawn from a normal distribution). Every p-value recorded below is far
# smaller than 0.05, so normality is rejected for all ten variables.
# NOTE(review): this is presumably why the rank-based tests (Wilcoxon,
# Kruskal-Wallis, Spearman) are used further down -- confirm.
shapiro.test(df$MonthlyRate)
##
## Shapiro-Wilk normality test
##
## data: df$MonthlyRate
## W = 0.9549, p-value = 1.083e-15
shapiro.test(df$PercentSalaryHike)
##
## Shapiro-Wilk normality test
##
## data: df$PercentSalaryHike
## W = 0.89909, p-value < 2.2e-16
shapiro.test(df$MonthlyIncome)
##
## Shapiro-Wilk normality test
##
## data: df$MonthlyIncome
## W = 0.83195, p-value < 2.2e-16
shapiro.test(df$HourlyRate)
##
## Shapiro-Wilk normality test
##
## data: df$HourlyRate
## W = 0.95517, p-value = 1.221e-15
shapiro.test(df$YearsSinceLastPromotion)
##
## Shapiro-Wilk normality test
##
## data: df$YearsSinceLastPromotion
## W = 0.70474, p-value < 2.2e-16
shapiro.test(df$YearsWithCurrManager)
##
## Shapiro-Wilk normality test
##
## data: df$YearsWithCurrManager
## W = 0.89891, p-value < 2.2e-16
shapiro.test(df$YearsAtCompany)
##
## Shapiro-Wilk normality test
##
## data: df$YearsAtCompany
## W = 0.85504, p-value < 2.2e-16
shapiro.test(df$YearsInCurrentRole)
##
## Shapiro-Wilk normality test
##
## data: df$YearsInCurrentRole
## W = 0.89509, p-value < 2.2e-16
shapiro.test(df$TotalWorkingYears)
##
## Shapiro-Wilk normality test
##
## data: df$TotalWorkingYears
## W = 0.90948, p-value < 2.2e-16
shapiro.test(df$NumCompaniesWorked)
##
## Shapiro-Wilk normality test
##
## data: df$NumCompaniesWorked
## W = 0.84746, p-value < 2.2e-16
# ---- Mann-Whitney (Wilcoxon rank-sum) tests ----
# Two-sided test of whether each numeric variable differs in location between
# employees with and without Attrition. The `##` lines are the recorded
# console output of each call.
wilcox.test(NumCompaniesWorked ~ Attrition,data = df, alternative = "two.sided")
##
## Wilcoxon rank sum test with continuity correction
##
## data: NumCompaniesWorked by Attrition
## W = 47486, p-value = 0.1723
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(MonthlyIncome ~ Attrition,data = df, alternative = "two.sided")
##
## Wilcoxon rank sum test with continuity correction
##
## data: MonthlyIncome by Attrition
## W = 67118, p-value = 4.074e-09
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(PercentSalaryHike ~ Attrition,data = df, alternative = "two.sided")
##
## Wilcoxon rank sum test with continuity correction
##
## data: PercentSalaryHike by Attrition
## W = 51018, p-value = 0.976
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(HourlyRate ~ Attrition,data = df, alternative = "two.sided")
##
## Wilcoxon rank sum test with continuity correction
##
## data: HourlyRate by Attrition
## W = 48218, p-value = 0.2901
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(DistanceFromHome ~ Attrition,data = df, alternative = "two.sided")
##
## Wilcoxon rank sum test with continuity correction
##
## data: DistanceFromHome by Attrition
## W = 45107, p-value = 0.02725
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(YearsSinceLastPromotion ~ Attrition,data = df, alternative = "two.sided")
##
## Wilcoxon rank sum test with continuity correction
##
## data: YearsSinceLastPromotion by Attrition
## W = 53456, p-value = 0.3681
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(YearsWithCurrManager ~ Attrition,data = df, alternative = "two.sided")
##
## Wilcoxon rank sum test with continuity correction
##
## data: YearsWithCurrManager by Attrition
## W = 64295, p-value = 9.347e-07
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(YearsAtCompany ~ Attrition,data = df, alternative = "two.sided")
##
## Wilcoxon rank sum test with continuity correction
##
## data: YearsAtCompany by Attrition
## W = 66124, p-value = 3.11e-08
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(YearsInCurrentRole ~ Attrition,data = df, alternative = "two.sided")
##
## Wilcoxon rank sum test with continuity correction
##
## data: YearsInCurrentRole by Attrition
## W = 65436, p-value = 9.482e-08
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(TotalWorkingYears ~ Attrition,data = df, alternative = "two.sided")
##
## Wilcoxon rank sum test with continuity correction
##
## data: TotalWorkingYears by Attrition
## W = 67078, p-value = 4.042e-09
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(MonthlyRate ~ Attrition,data = df, alternative = "two.sided")
##
## Wilcoxon rank sum test with continuity correction
##
## data: MonthlyRate by Attrition
## W = 54526, p-value = 0.2086
## alternative hypothesis: true location shift is not equal to 0
# Summary: MonthlyIncome (p-value = 4.074e-09), DistanceFromHome (p-value = 0.02725),
# YearsWithCurrManager (p-value = 9.347e-07), YearsAtCompany (p-value = 3.11e-08),
# YearsInCurrentRole (p-value = 9.482e-08) and TotalWorkingYears (p-value = 4.042e-09)
# differ significantly between employees with and without Attrition.
# Evaluate the relationship between the categorical variables and Attrition using the chi-square test
# Pearson chi-squared test of independence between each categorical (or
# discrete) variable and Attrition. The `##` lines are the recorded console
# output of each call; for the 2x2 tables the output header shows that Yates'
# continuity correction was applied.
chisq.test(df$JobSatisfaction,df$Attrition)
##
## Pearson's Chi-squared test
##
## data: df$JobSatisfaction and df$Attrition
## X-squared = 11.109, df = 3, p-value = 0.01115
chisq.test(df$OverTime,df$Attrition)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: df$OverTime and df$Attrition
## X-squared = 62.762, df = 1, p-value = 2.333e-15
chisq.test(df$JobInvolvement,df$Attrition)
##
## Pearson's Chi-squared test
##
## data: df$JobInvolvement and df$Attrition
## X-squared = 41.465, df = 3, p-value = 5.211e-09
chisq.test(df$StockOptionLevel,df$Attrition)
##
## Pearson's Chi-squared test
##
## data: df$StockOptionLevel and df$Attrition
## X-squared = 56.245, df = 3, p-value = 3.724e-12
chisq.test(df$NumCompaniesWorked,df$Attrition)
##
## Pearson's Chi-squared test
##
## data: df$NumCompaniesWorked and df$Attrition
## X-squared = 20.19, df = 9, p-value = 0.01678
chisq.test(df$WorkLifeBalance,df$Attrition)
##
## Pearson's Chi-squared test
##
## data: df$WorkLifeBalance and df$Attrition
## X-squared = 14.325, df = 3, p-value = 0.002495
chisq.test(df$TrainingTimesLastYear,df$Attrition)
##
## Pearson's Chi-squared test
##
## data: df$TrainingTimesLastYear and df$Attrition
## X-squared = 10.132, df = 6, p-value = 0.1192
chisq.test(df$PerformanceRating,df$Attrition)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: df$PerformanceRating and df$Attrition
## X-squared = 0.10478, df = 1, p-value = 0.7462
chisq.test(df$RelationshipSatisfaction,df$Attrition)
##
## Pearson's Chi-squared test
##
## data: df$RelationshipSatisfaction and df$Attrition
## X-squared = 3.1253, df = 3, p-value = 0.3727
chisq.test(df$EnvironmentSatisfaction,df$Attrition)
##
## Pearson's Chi-squared test
##
## data: df$EnvironmentSatisfaction and df$Attrition
## X-squared = 11.231, df = 3, p-value = 0.01054
chisq.test(df$JobLevel,df$Attrition)
##
## Pearson's Chi-squared test
##
## data: df$JobLevel and df$Attrition
## X-squared = 41.533, df = 4, p-value = 2.085e-08
chisq.test(df$JobRole,df$Attrition)
##
## Pearson's Chi-squared test
##
## data: df$JobRole and df$Attrition
## X-squared = 60.543, df = 8, p-value = 3.647e-10
chisq.test(df$Department,df$Attrition)
##
## Pearson's Chi-squared test
##
## data: df$Department and df$Attrition
## X-squared = 9.329, df = 2, p-value = 0.009424
# NOTE(review): the next call repeats the NumCompaniesWorked test already run
# above and gives the same result; it looks redundant.
chisq.test(df$NumCompaniesWorked,df$Attrition)
##
## Pearson's Chi-squared test
##
## data: df$NumCompaniesWorked and df$Attrition
## X-squared = 20.19, df = 9, p-value = 0.01678
# Summary -- significantly associated with Attrition (p < 0.05):
# JobSatisfaction (p-value = 0.01115), OverTime (p-value = 2.333e-15), JobInvolvement (p-value = 5.211e-09),
# StockOptionLevel (p-value = 3.724e-12), WorkLifeBalance (p-value = 0.002495), EnvironmentSatisfaction (p-value = 0.01054),
# JobLevel (p-value = 2.085e-08), Department (p-value = 0.009424), JobRole (p-value = 3.647e-10), NumCompaniesWorked (p-value = 0.01678).
# Not significant: TrainingTimesLastYear (p-value = 0.1192), PerformanceRating (p-value = 0.7462),
# RelationshipSatisfaction (p-value = 0.3727).
# Spearman rank-correlation tests between MonthlyIncome and each numeric
# variable. `exact = FALSE` requests the asymptotic p-value. The alternative
# is spelled out in full ("two.sided") instead of relying on partial argument
# matching of "two.side". The reported estimate is Spearman's rho (not R^2),
# so the interpretive comments quote it as rho.
cor.test(df$Age, df$MonthlyIncome,
         method = "spearman",
         exact = FALSE,
         alternative = "two.sided")
##
## Spearman's rank correlation rho
##
## data: df$Age and df$MonthlyIncome
## S = 60400498, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.4496556
# There is a positive correlation between age and income (rho = 0.45, p-value < 2.2e-16)
cor.test(df$PercentSalaryHike, df$MonthlyIncome,
         method = "spearman",
         exact = FALSE,
         alternative = "two.sided")
##
## Spearman's rank correlation rho
##
## data: df$PercentSalaryHike and df$MonthlyIncome
## S = 116154632, p-value = 0.0854
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.05835313
# There is no significant correlation between PercentSalaryHike and income (rho = -0.06, p-value = 0.0854)
cor.test(df$HourlyRate, df$MonthlyIncome,
         method = "spearman",
         exact = FALSE,
         alternative = "two.sided")
##
## Spearman's rank correlation rho
##
## data: df$HourlyRate and df$MonthlyIncome
## S = 110777571, p-value = 0.7828
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.00935957
# There is no significant correlation between HourlyRate and income (rho = -0.01, p-value = 0.7828)
cor.test(df$DistanceFromHome, df$MonthlyIncome,
         method = "spearman",
         exact = FALSE,
         alternative = "two.sided")
##
## Spearman's rank correlation rho
##
## data: df$DistanceFromHome and df$MonthlyIncome
## S = 107844895, p-value = 0.6091
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.01736176
# There is no significant correlation between DistanceFromHome and income (rho = 0.02, p-value = 0.6091)
cor.test(df$YearsSinceLastPromotion, df$MonthlyIncome,
         method = "spearman",
         exact = FALSE,
         alternative = "two.sided")
##
## Spearman's rank correlation rho
##
## data: df$YearsSinceLastPromotion and df$MonthlyIncome
## S = 80301098, p-value = 8.191e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.2683295
# There is a positive correlation between YearsSinceLastPromotion and income (rho = 0.27, p-value = 8.191e-16)
cor.test(df$YearsWithCurrManager, df$MonthlyIncome,
         method = "spearman",
         exact = FALSE,
         alternative = "two.sided")
##
## Spearman's rank correlation rho
##
## data: df$YearsWithCurrManager and df$MonthlyIncome
## S = 69749771, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.3644688
# There is a positive correlation between YearsWithCurrManager and income (rho = 0.36, p-value < 2.2e-16)
cor.test(df$YearsAtCompany, df$MonthlyIncome,
         method = "spearman",
         exact = FALSE,
         alternative = "two.sided")
##
## Spearman's rank correlation rho
##
## data: df$YearsAtCompany and df$MonthlyIncome
## S = 59312414, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.4595697
# There is a positive correlation between YearsAtCompany and income (rho = 0.46, p-value < 2.2e-16)
cor.test(df$YearsInCurrentRole, df$MonthlyIncome,
         method = "spearman",
         exact = FALSE,
         alternative = "two.sided")
##
## Spearman's rank correlation rho
##
## data: df$YearsInCurrentRole and df$MonthlyIncome
## S = 65683339, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.4015205
# There is a positive correlation between YearsInCurrentRole and income (rho = 0.40, p-value < 2.2e-16)
cor.test(df$TotalWorkingYears, df$MonthlyIncome,
         method = "spearman",
         exact = FALSE,
         alternative = "two.sided")
##
## Spearman's rank correlation rho
##
## data: df$TotalWorkingYears and df$MonthlyIncome
## S = 31181970, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.7158827
# There is a strong positive correlation between TotalWorkingYears and income (rho = 0.72, p-value < 2.2e-16)
cor.test(df$MonthlyRate, df$MonthlyIncome,
         method = "spearman",
         exact = FALSE,
         alternative = "two.sided")
##
## Spearman's rank correlation rho
##
## data: df$MonthlyRate and df$MonthlyIncome
## S = 100095425, p-value = 0.009429
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.08797174
# There is a very weak but statistically significant positive correlation between MonthlyRate and income (rho = 0.09, p-value = 0.009429)
#-------------------------------
wilcox.test(MonthlyIncome ~ Attrition,data = df, alternative = "two.sided")
##
## Wilcoxon rank sum test with continuity correction
##
## data: MonthlyIncome by Attrition
## W = 67118, p-value = 4.074e-09
## alternative hypothesis: true location shift is not equal to 0
# MonthlyIncome differs significantly between employees with and without attrition
# (p-value = 4.074e-09).
### 2. Monthly Income vs BusinessTravel
# Kruskal-Wallis rank-sum test of MonthlyIncome across BusinessTravel groups.
kruskal.test(
  MonthlyIncome ~ BusinessTravel,
  data = df
)
##
## Kruskal-Wallis rank sum test
##
## data: MonthlyIncome by BusinessTravel
## Kruskal-Wallis chi-squared = 2.2416, df = 2, p-value = 0.326
# Kruskal-Wallis rank-sum test of MonthlyIncome across NumCompaniesWorked
# groups. NOTE(review): the same test is run again under heading 17 further
# down; one of the two calls is redundant.
kruskal.test(
  MonthlyIncome ~ NumCompaniesWorked,
  data = df
)
##
## Kruskal-Wallis rank sum test
##
## data: MonthlyIncome by NumCompaniesWorked
## Kruskal-Wallis chi-squared = 54.478, df = 9, p-value = 1.531e-08
### 3. Monthly Income vs Department
# Kruskal-Wallis rank-sum test of MonthlyIncome across Department groups.
kruskal.test(
  MonthlyIncome ~ Department,
  data = df
)
##
## Kruskal-Wallis rank sum test
##
## data: MonthlyIncome by Department
## Kruskal-Wallis chi-squared = 25.546, df = 2, p-value = 2.836e-06
#Monthly income has a significant relationship with Department(p-value = 2.836e-06)
### 4. Monthly Income vs EducationField
# Kruskal-Wallis rank-sum test of MonthlyIncome across EducationField groups.
kruskal.test(
  MonthlyIncome ~ EducationField,
  data = df
)
##
## Kruskal-Wallis rank sum test
##
## data: MonthlyIncome by EducationField
## Kruskal-Wallis chi-squared = 15.544, df = 5, p-value = 0.008274
#Monthly income has a significant relationship with EducationField(p-value = 0.008274)
### 5. Monthly Income vs Gender
# Two-sided Wilcoxon rank-sum test: MonthlyIncome compared between the two
# Gender groups.
wilcox.test(
  MonthlyIncome ~ Gender,
  data = df,
  alternative = "two.sided"
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: MonthlyIncome by Gender
## W = 98590, p-value = 0.04623
## alternative hypothesis: true location shift is not equal to 0
#p-value = 0.04623
### 6. Monthly Income vs JobRole
# Kruskal-Wallis rank-sum test of MonthlyIncome across JobRole groups.
kruskal.test(
  MonthlyIncome ~ JobRole,
  data = df
)
##
## Kruskal-Wallis rank sum test
##
## data: MonthlyIncome by JobRole
## Kruskal-Wallis chi-squared = 636.1, df = 8, p-value < 2.2e-16
#p-value < 2.2e-16
### 7. Monthly Income vs MaritalStatus
# Kruskal-Wallis rank-sum test of MonthlyIncome across MaritalStatus groups.
kruskal.test(
  MonthlyIncome ~ MaritalStatus,
  data = df
)
##
## Kruskal-Wallis rank sum test
##
## data: MonthlyIncome by MaritalStatus
## Kruskal-Wallis chi-squared = 9.358, df = 2, p-value = 0.009288
#p-value = 0.009288
### 8. Monthly Income vs OverTime
# Two-sided Wilcoxon rank-sum test: MonthlyIncome compared between the two
# OverTime groups.
wilcox.test(
  MonthlyIncome ~ OverTime,
  data = df,
  alternative = "two.sided"
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: MonthlyIncome by OverTime
## W = 79554, p-value = 0.6161
## alternative hypothesis: true location shift is not equal to 0
### 9. Monthly Income vs Education
# Kruskal-Wallis rank-sum test of MonthlyIncome across Education levels.
kruskal.test(
  MonthlyIncome ~ Education,
  data = df
)
##
## Kruskal-Wallis rank sum test
##
## data: MonthlyIncome by Education
## Kruskal-Wallis chi-squared = 20.448, df = 4, p-value = 0.0004072
#p-value = 0.0004072
### 10. Monthly Income vs EnvironmentSatisfaction
# Kruskal-Wallis rank-sum test of MonthlyIncome across
# EnvironmentSatisfaction levels.
kruskal.test(
  MonthlyIncome ~ EnvironmentSatisfaction,
  data = df
)
##
## Kruskal-Wallis rank sum test
##
## data: MonthlyIncome by EnvironmentSatisfaction
## Kruskal-Wallis chi-squared = 1.4961, df = 3, p-value = 0.6832
### 11. Monthly Income vs JobInvolvement
# Kruskal-Wallis rank-sum test of MonthlyIncome across JobInvolvement levels.
kruskal.test(
  MonthlyIncome ~ JobInvolvement,
  data = df
)
##
## Kruskal-Wallis rank sum test
##
## data: MonthlyIncome by JobInvolvement
## Kruskal-Wallis chi-squared = 0.24444, df = 3, p-value = 0.9701
### 12. Monthly Income vs JobLevel
# Kruskal-Wallis rank-sum test of MonthlyIncome across JobLevel levels.
kruskal.test(
  MonthlyIncome ~ JobLevel,
  data = df
)
##
## Kruskal-Wallis rank sum test
##
## data: MonthlyIncome by JobLevel
## Kruskal-Wallis chi-squared = 744.02, df = 4, p-value < 2.2e-16
#p-value < 2.2e-16
### 13. Monthly Income vs JobSatisfaction
# Kruskal-Wallis rank-sum test of MonthlyIncome across JobSatisfaction levels.
kruskal.test(
  MonthlyIncome ~ JobSatisfaction,
  data = df
)
##
## Kruskal-Wallis rank sum test
##
## data: MonthlyIncome by JobSatisfaction
## Kruskal-Wallis chi-squared = 1.3648, df = 3, p-value = 0.7138
### 14. Monthly Income vs RelationshipSatisfaction
# Kruskal-Wallis rank-sum test of MonthlyIncome across
# RelationshipSatisfaction levels.
kruskal.test(
  MonthlyIncome ~ RelationshipSatisfaction,
  data = df
)
##
## Kruskal-Wallis rank sum test
##
## data: MonthlyIncome by RelationshipSatisfaction
## Kruskal-Wallis chi-squared = 1.4622, df = 3, p-value = 0.691
### 15. Monthly Income vs StockOptionLevel
# Kruskal-Wallis rank-sum test of MonthlyIncome across StockOptionLevel levels.
kruskal.test(
  MonthlyIncome ~ StockOptionLevel,
  data = df
)
##
## Kruskal-Wallis rank sum test
##
## data: MonthlyIncome by StockOptionLevel
## Kruskal-Wallis chi-squared = 8.8154, df = 3, p-value = 0.03185
#p-value = 0.03185
### 16. Monthly Income vs WorkLifeBalance
# Kruskal-Wallis rank-sum test of MonthlyIncome across WorkLifeBalance levels.
kruskal.test(
  MonthlyIncome ~ WorkLifeBalance,
  data = df
)
##
## Kruskal-Wallis rank sum test
##
## data: MonthlyIncome by WorkLifeBalance
## Kruskal-Wallis chi-squared = 1.1367, df = 3, p-value = 0.7682
####17. Monthly Income vs NumCompaniesWorked
# Kruskal-Wallis rank-sum test of MonthlyIncome across NumCompaniesWorked
# groups. NOTE(review): this repeats the call made right after the
# BusinessTravel test earlier in this section.
kruskal.test(
  MonthlyIncome ~ NumCompaniesWorked,
  data = df
)
##
## Kruskal-Wallis rank sum test
##
## data: MonthlyIncome by NumCompaniesWorked
## Kruskal-Wallis chi-squared = 54.478, df = 9, p-value = 1.531e-08
#p-value = 1.531e-08
# Monthly income does not differ significantly across the levels of BusinessTravel,
# EnvironmentSatisfaction, OverTime, JobInvolvement, JobSatisfaction,
# RelationshipSatisfaction, and WorkLifeBalance.
# Monthly income has a significant relationship with StockOptionLevel, JobLevel, Education,
# MaritalStatus, JobRole, Gender, EducationField, Department, Attrition, and
# NumCompaniesWorked.
# Derive three tenure ratios on the training data and on both competition
# sets, each stored as a plain numeric column:
#   year2 = YearsInCurrentRole / YearsAtCompany
#   year4 = YearsInCurrentRole / TotalWorkingYears
#   year5 = YearsAtCompany     / TotalWorkingYears
# A 0 / 0 ratio evaluates to NaN, which is.na() reports as missing.
df <- df %>%
  mutate(
    year2 = as.numeric(YearsInCurrentRole / YearsAtCompany),
    year4 = as.numeric(YearsInCurrentRole / TotalWorkingYears),
    year5 = as.numeric(YearsAtCompany / TotalWorkingYears)
  )

cs2.NoAttrition <- cs2.NoAttrition %>%
  mutate(
    year2 = as.numeric(YearsInCurrentRole / YearsAtCompany),
    year4 = as.numeric(YearsInCurrentRole / TotalWorkingYears),
    year5 = as.numeric(YearsAtCompany / TotalWorkingYears)
  )

cs2.NoSalary <- cs2.NoSalary %>%
  mutate(
    year2 = as.numeric(YearsInCurrentRole / YearsAtCompany),
    year4 = as.numeric(YearsInCurrentRole / TotalWorkingYears),
    year5 = as.numeric(YearsAtCompany / TotalWorkingYears)
  )
# Dealing with missing data in the new datasets
#' Replace missing values in every numeric column with that column's median.
#'
#' @param data A data frame.
#' @return `data` with the NA/NaN entries of each numeric column set to the
#'   column median (computed with `na.rm = TRUE`); non-numeric columns are
#'   returned unchanged.
impute_numeric_median <- function(data) {
  # seq_along() is safe for a zero-column frame, unlike 1:ncol(data).
  for (j in seq_along(data)) {
    column <- data[[j]]
    if (is.numeric(column)) {
      data[is.na(column), j] <- median(column, na.rm = TRUE)
    }
  }
  data
}

# Missing values before imputation (presumably the NaN ratios produced where a
# tenure denominator is zero -- confirm with plot_missing()).
plot_missing(df)
sum(is.na(df))
## [1] 42
df <- impute_numeric_median(df)
sum(is.na(df))
## [1] 0
cs2.NoAttrition <- impute_numeric_median(cs2.NoAttrition)
sum(is.na(cs2.NoAttrition))
## [1] 0
cs2.NoSalary <- impute_numeric_median(cs2.NoSalary)
plot_missing(df)
sum(is.na(df))
## [1] 0
# Statistical analysis of the new variables:
# Two-sided Wilcoxon rank-sum test: year2 compared between the two Attrition
# groups.
wilcox.test(
  year2 ~ Attrition,
  data = df,
  alternative = "two.sided"
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: year2 by Attrition
## W = 57437, p-value = 0.01949
## alternative hypothesis: true location shift is not equal to 0
# Two-sided Wilcoxon rank-sum test: year4 compared between the two Attrition
# groups.
wilcox.test(
  year4 ~ Attrition,
  data = df,
  alternative = "two.sided"
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: year4 by Attrition
## W = 58776, p-value = 0.004714
## alternative hypothesis: true location shift is not equal to 0
# Spearman rank correlation between the year5 ratio and MonthlyIncome.
# `alternative` is spelled out in full; "two.side" only worked because the
# value is partially matched against the allowed choices.
cor.test(
  df$year5, df$MonthlyIncome,
  method = "spearman",
  exact = FALSE,
  alternative = "two.sided"
)
##
## Spearman's rank correlation rho
##
## data: df$year5 and df$MonthlyIncome
## S = 1.29e+08, p-value = 1.912e-07
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.1754371
# Identify the most important variables for predicting attrition
library(xgboost)
# Fit a single CART tree of Attrition on every other column of df.
tree <- rpart(Attrition ~ ., data = df)
# Fit an RF; importance = TRUE asks randomForest to assess predictor importance.
set.seed(101) # for reproducibility
rfo <- randomForest(Attrition ~ ., data = df, importance = TRUE)
# NOTE(review): no GBM is fitted in this section. The seed call is kept so the
# RNG state seen by the code that follows is unchanged.
set.seed(102)
# Extract VI scores from each model
vi_tree <- tree$variable.importance
# A randomForest object has no `variable.importance` component (that name
# belongs to rpart), so `rfo$variable.importance` was always NULL. Use the
# package accessor instead.
vi_rfo <- randomForest::importance(rfo)
# Load required packages
library(vip)
# Compute model-specific VI scores
vi(tree) # CART-like decision tree
## # A tibble: 26 × 2
## Variable Importance
## <chr> <dbl>
## 1 MonthlyIncome 21.2
## 2 OverTime 17.4
## 3 DailyRate 10.5
## 4 Age 8.62
## 5 StockOptionLevel 8.34
## 6 JobRole 6.97
## 7 TotalWorkingYears 6.60
## 8 JobSatisfaction 6.12
## 9 MaritalStatus 5.97
## 10 year4 5.91
## # … with 16 more rows
# Model-specific variable-importance scores for the random forest.
vi(rfo)
## # A tibble: 39 × 2
## Variable Importance
## <chr> <dbl>
## 1 OverTime 18.3
## 2 MonthlyIncome 11.5
## 3 StockOptionLevel 11.5
## 4 Age 10.2
## 5 JobRole 9.03
## 6 MaritalStatus 8.71
## 7 YearsAtCompany 7.74
## 8 JobLevel 6.76
## 9 JobInvolvement 6.35
## 10 TotalWorkingYears 6.05
## # … with 29 more rows
# Variable-importance plots for the two fitted models.
p1 <- vip(tree) + ggtitle("Single tree")
p2 <- vip(rfo) + ggtitle("Random forest")
# Display plots in a grid (Figure 1).
# Only p1 and p2 are passed: no third plot (p3) is created in this section,
# so referencing it either fails or pulls in an unrelated object.
grid.arrange(p1, p2, nrow = 1)
# Create new datasets with the most important variables.
# The training frame and the competition set must carry exactly the same
# predictors in the same order, so the column list is defined once and reused
# instead of being maintained by hand in two places.
attrition_features <- c(
  "OverTime",
  "JobSatisfaction",
  "JobInvolvement",
  "StockOptionLevel",
  "WorkLifeBalance",
  "EnvironmentSatisfaction",
  "Department",
  "JobRole",
  "NumCompaniesWorked",
  "MonthlyIncome",
  "DistanceFromHome",
  "YearsWithCurrManager",
  "YearsAtCompany",
  "YearsInCurrentRole",
  "TotalWorkingYears",
  "year2",
  "year4"
)
# List-style indexing keeps a data frame and errors on an unknown column name.
df_attrition <- df[attrition_features]
# NOTE: this overwrites cs2.NoAttrition with the reduced frame, so columns not
# listed above (such as ID) are no longer available from it afterwards.
cs2.NoAttrition <- cs2.NoAttrition[attrition_features]
# Predictors used for the MonthlyIncome model. The training frame and the
# competition set share one column list so they cannot drift apart.
# (YearsSinceLastPromotion was previously listed twice; it is needed once.)
salary_features <- c(
  "Age",
  "YearsSinceLastPromotion",
  "YearsWithCurrManager",
  "year5",
  "YearsAtCompany",
  "YearsInCurrentRole",
  "TotalWorkingYears",
  "StockOptionLevel",
  "JobLevel",
  "Education",
  "MaritalStatus",
  "JobRole",
  "Gender",
  "EducationField",
  "Department",
  "Attrition",
  "NumCompaniesWorked"
)
# List-style indexing keeps a data frame and errors on an unknown column name.
df_for_salary <- df[salary_features]
# NOTE: this overwrites cs2.NoSalary with the reduced frame.
cs2.NoSalary <- cs2.NoSalary[salary_features]
# Converting factor variables to numeric variables (dummies).
# A separate encoder is fitted on, and applied to, each of the four frames.
# NOTE(review): because the encoders are fitted independently, the training
# and competition columns only line up if both frames contain the same factor
# levels -- confirm.

# --- Attrition model: training frame and competition set ---
dmy <- dummyVars(" ~ .", data = df_attrition)
df_attrition1 <- data.frame(predict(dmy, newdata = df_attrition))
df_attrition2 <- cbind(df_attrition1, new_col = df["Attrition"])

dmy1 <- dummyVars(" ~ .", data = cs2.NoAttrition)
cs2.NoAttrition_test <- data.frame(predict(dmy1, newdata = cs2.NoAttrition))

# --- Salary model: training frame and competition set ---
dmy2 <- dummyVars(" ~ .", data = df_for_salary)
df_for_salary1 <- data.frame(predict(dmy2, newdata = df_for_salary))
df_for_salary2 <- cbind(df_for_salary1, new_col = df["MonthlyIncome"])

# StockOptionLevel is converted to a factor before encoding this frame.
cs2.NoSalary$StockOptionLevel <- as.factor(cs2.NoSalary$StockOptionLevel)
dmy3 <- dummyVars(" ~ .", data = cs2.NoSalary)
cs2.NoSalary_test <- data.frame(predict(dmy3, newdata = cs2.NoSalary))
#Normalizing the numeric variables
library(tidyverse)
#' Min-max scale a numeric vector to the [0, 1] range.
#'
#' @param x A numeric vector.
#' @param na.rm If `TRUE`, missing values are ignored when computing the range
#'   and stay `NA` in the result. The default `FALSE` keeps the earlier
#'   behaviour, where any `NA` in `x` makes the whole result `NA`.
#' @return A numeric vector the same length as `x`. A constant vector is
#'   mapped to zeros rather than the NaN that 0 / 0 would produce.
normalize <- function(x, na.rm = FALSE) {
  if (length(x) == 0L) {
    return(x)
  }
  lo <- min(x, na.rm = na.rm)
  hi <- max(x, na.rm = na.rm)
  span <- hi - lo
  # Zero range: every (non-missing) value equals lo, so the scaled value is 0.
  if (!is.na(span) && span == 0) {
    return(x - lo)
  }
  (x - lo) / span
}
# Min-max scale the continuous predictors of each modelling frame.
# NOTE(review): every frame is scaled with its own minimum and maximum, so the
# competition sets are not on exactly the same scale as the training data --
# confirm this is intended.

# Attrition model: training frame.
df_attrition2 <- df_attrition2 %>%
  mutate(
    DistanceFromHome = normalize(DistanceFromHome),
    YearsWithCurrManager = normalize(YearsWithCurrManager),
    YearsAtCompany = normalize(YearsAtCompany),
    TotalWorkingYears = normalize(TotalWorkingYears),
    YearsInCurrentRole = normalize(YearsInCurrentRole),
    MonthlyIncome = normalize(MonthlyIncome)
  )

# Attrition model: competition set.
cs2.NoAttrition_test <- cs2.NoAttrition_test %>%
  mutate(
    DistanceFromHome = normalize(DistanceFromHome),
    YearsWithCurrManager = normalize(YearsWithCurrManager),
    YearsAtCompany = normalize(YearsAtCompany),
    TotalWorkingYears = normalize(TotalWorkingYears),
    YearsInCurrentRole = normalize(YearsInCurrentRole),
    MonthlyIncome = normalize(MonthlyIncome)
  )

# Salary model: training frame.
df_for_salary2 <- df_for_salary2 %>%
  mutate(
    Age = normalize(Age),
    YearsWithCurrManager = normalize(YearsWithCurrManager),
    YearsSinceLastPromotion = normalize(YearsSinceLastPromotion),
    YearsAtCompany = normalize(YearsAtCompany),
    year5 = normalize(year5),
    TotalWorkingYears = normalize(TotalWorkingYears)
  )

# Salary model: competition set.
cs2.NoSalary_test <- cs2.NoSalary_test %>%
  mutate(
    Age = normalize(Age),
    YearsWithCurrManager = normalize(YearsWithCurrManager),
    YearsSinceLastPromotion = normalize(YearsSinceLastPromotion),
    YearsAtCompany = normalize(YearsAtCompany),
    year5 = normalize(year5),
    TotalWorkingYears = normalize(TotalWorkingYears)
  )
# Dealing with the missing data.
# Median-impute the numeric columns of df, cs2.NoAttrition and cs2.NoSalary.
# The recorded output shows df already had no missing values before this pass.
# seq_along() replaces 1:ncol(), which misbehaves on a zero-column frame.
plot_missing(df)
sum(is.na(df))
## [1] 0
for (col_idx in seq_along(df)) {
  if (is.numeric(df[[col_idx]])) {
    df[is.na(df[[col_idx]]), col_idx] <- median(df[[col_idx]], na.rm = TRUE)
  }
}
sum(is.na(df))
## [1] 0
for (col_idx in seq_along(cs2.NoAttrition)) {
  if (is.numeric(cs2.NoAttrition[[col_idx]])) {
    cs2.NoAttrition[is.na(cs2.NoAttrition[[col_idx]]), col_idx] <-
      median(cs2.NoAttrition[[col_idx]], na.rm = TRUE)
  }
}
sum(is.na(cs2.NoAttrition))
## [1] 0
for (col_idx in seq_along(cs2.NoSalary)) {
  if (is.numeric(cs2.NoSalary[[col_idx]])) {
    cs2.NoSalary[is.na(cs2.NoSalary[[col_idx]]), col_idx] <-
      median(cs2.NoSalary[[col_idx]], na.rm = TRUE)
  }
}
plot_missing(df)
sum(is.na(df))
## [1] 0
# Using naive Bayes to predict attrition.
# 75/25 train/test split with a fixed seed.
set.seed(1234)
sample_set <- sample(
  nrow(df_attrition2),
  round(nrow(df_attrition2) * 0.75),
  replace = FALSE
)
df_attrition2_train <- df_attrition2[sample_set, ]
df_attrition2_test <- df_attrition2[-sample_set, ]

# Fit on the training rows and classify the held-out rows.
naive <- naiveBayes(Attrition ~ ., data = df_attrition2_train)
pred <- predict(naive, df_attrition2_test, type = "class")

# Confusion table: rows are the actual class, columns the predicted class.
pred_table <- table(df_attrition2_test$Attrition, pred)
pred_table
## pred
## No Yes
## No 130 49
## Yes 8 31
# Hold-out accuracy.
sum(diag(pred_table)) / nrow(df_attrition2_test)
## [1] 0.7385321
# Same predictions tabulated with predictions in rows and actuals in columns;
# sensitivity and specificity are reported as whole percentages.
tab.naive <- table(predicted = pred, Actual = df_attrition2_test$Attrition)
(sensitivity <- round(tab.naive[2, 2] * 100 / (tab.naive[2, 2] + tab.naive[1, 2])))
## [1] 79
(specificity <- round(tab.naive[1, 1] * 100 / (tab.naive[1, 1] + tab.naive[2, 1])))
## [1] 73

# Refit on all rows. The metrics below are computed on the same rows the model
# was fitted on, so they are in-sample figures.
final.cls.model <- naiveBayes(Attrition ~ ., data = df_attrition2)
pred4 <- predict(final.cls.model, newdata = df_attrition2)
tab.naive <- table(predicted = pred4, Actual = df_attrition2$Attrition)
(sensitivity <- round(tab.naive[2, 2] * 100 / (tab.naive[2, 2] + tab.naive[1, 2])))
## [1] 80
(specificity <- round(tab.naive[1, 1] * 100 / (tab.naive[1, 1] + tab.naive[2, 1])))
## [1] 72
attrition_matrix <- confusionMatrix(
  pred4,
  df_attrition2$Attrition,
  positive = "Yes"
)
attrition_matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 525 28
## Yes 205 112
##
## Accuracy : 0.7322
## 95% CI : (0.7014, 0.7614)
## No Information Rate : 0.8391
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3436
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8000
## Specificity : 0.7192
## Pos Pred Value : 0.3533
## Neg Pred Value : 0.9494
## Prevalence : 0.1609
## Detection Rate : 0.1287
## Detection Prevalence : 0.3644
## Balanced Accuracy : 0.7596
##
## 'Positive' Class : Yes
##
# Precision, sensitivity, specificity and F1 of the final naive Bayes model,
# all computed from pred4 (its predictions on df_attrition2).
precision <- posPredValue(pred4, df_attrition2$Attrition, positive = "Yes")
precision
## [1] 0.3533123
sensitivity <- sensitivity(pred4, df_attrition2$Attrition, positive = "Yes")
sensitivity
## [1] 0.8
# Fixed: this call used `pred` (the hold-out predictions, a shorter vector)
# against the full-data reference, so the vectors did not correspond and the
# reported 0.6328767 was meaningless. With pred4 the value agrees with the
# confusion matrix above: 525 / (525 + 205).
specificity <- specificity(pred4, df_attrition2$Attrition, negative = "No")
specificity
## [1] 0.7191781
f1_score <- (2 * precision * sensitivity) / (precision + sensitivity)
f1_score
## [1] 0.4901532
# Classify the competition set with the final naive Bayes model and show the
# predicted labels.
predfinal <- predict(
  final.cls.model,
  newdata = cs2.NoAttrition_test
)
predfinal
## [1] No Yes Yes No No No No No No Yes Yes Yes No No No No No No
## [19] No No Yes No No Yes Yes No Yes Yes No Yes Yes No No Yes Yes No
## [37] No No No No Yes Yes No No Yes No No No Yes No No Yes No No
## [55] No No No Yes No No No No Yes No No Yes No No No No Yes No
## [73] No No No Yes No Yes No Yes No No No No No No No No No Yes
## [91] No No Yes No No No Yes No Yes Yes No No No No Yes No No No
## [109] Yes No No No No No No No No No Yes No No No No No No No
## [127] Yes No No No Yes No No Yes No No No Yes Yes Yes Yes No No Yes
## [145] No Yes Yes Yes Yes No No Yes No No Yes Yes Yes Yes Yes No No No
## [163] Yes No No Yes No No No Yes No Yes Yes Yes No No No Yes No No
## [181] Yes No No No No No No Yes No No Yes No No No Yes Yes Yes No
## [199] No No No No Yes No No No No No Yes No Yes No No No No Yes
## [217] No No No No Yes No No No No No Yes No Yes No Yes Yes No Yes
## [235] No No Yes Yes No No Yes No No Yes No No Yes No No Yes Yes No
## [253] No Yes No No No No No No No No No No No No No No Yes No
## [271] Yes No No Yes Yes Yes Yes No Yes Yes No Yes No No Yes Yes Yes Yes
## [289] Yes Yes No No No No Yes No No Yes No No
## Levels: No Yes
# Re-read the raw competition file: cs2.NoAttrition was reduced to the model
# features earlier, so the ID column has to come from the original CSV.
# NOTE(review): the path is an absolute, machine-specific location.
cs2.NoAttrition <- read.csv(
  "/Users/owner/Desktop/homework/unit14,15(case sudy)/Unit%2014%20and%2015%20Case%20Study%202 2/CaseStudy02/CaseStudy2CompSet No Attrition.csv",
  stringsAsFactors = TRUE
)
# Pair each competition ID with its predicted attrition label.
pred.df <- data.frame(
  ID = cs2.NoAttrition$ID,
  Attrition = predfinal
)
# ---------------------------------------------------------------
# Variable importance based on the naive Bayes model.
# A single fixed parameter set is supplied and trainControl(method = "none")
# is used, so caret fits the model once without resampling.
Grid <- data.frame(usekernel = TRUE, laplace = 0, adjust = 1)
mdl <- train(
  Attrition ~ .,
  data = df_attrition2,
  method = "naive_bayes",
  trControl = trainControl(method = "none"),
  tuneGrid = Grid
)
varImp(mdl)
## ROC curve variable importance
##
## only 20 most important variables shown (out of 43)
##
## Importance
## OverTimeNo 100.00
## OverTimeYes 100.00
## StockOptionLevel.0 93.76
## MonthlyIncome 93.29
## TotalWorkingYears 93.05
## YearsAtCompany 87.44
## YearsInCurrentRole 83.39
## YearsWithCurrManager 76.68
## StockOptionLevel.1 76.15
## year4 44.21
## JobInvolvement.3 39.26
## JobSatisfaction.4 39.00
## JobRole.Sales.Representative 38.65
## DepartmentResearch...Development 38.56
## DepartmentSales 37.62
## year2 36.34
## JobInvolvement.1 36.00
## EnvironmentSatisfaction.1 35.71
## DistanceFromHome 34.31
## JobRole.Manufacturing.Director 29.76
# Median-impute the numeric columns of every working frame once more before
# the GBM / KNN models. The recorded outputs show no missing values remained.
# seq_along() replaces 1:ncol(), which misbehaves on a zero-column frame.
for (col_idx in seq_along(df)) {
  if (is.numeric(df[[col_idx]])) {
    df[is.na(df[[col_idx]]), col_idx] <- median(df[[col_idx]], na.rm = TRUE)
  }
}
sum(is.na(df))
## [1] 0
for (col_idx in seq_along(cs2.NoAttrition)) {
  if (is.numeric(cs2.NoAttrition[[col_idx]])) {
    cs2.NoAttrition[is.na(cs2.NoAttrition[[col_idx]]), col_idx] <-
      median(cs2.NoAttrition[[col_idx]], na.rm = TRUE)
  }
}
sum(is.na(cs2.NoAttrition))
## [1] 0
for (col_idx in seq_along(cs2.NoSalary)) {
  if (is.numeric(cs2.NoSalary[[col_idx]])) {
    cs2.NoSalary[is.na(cs2.NoSalary[[col_idx]]), col_idx] <-
      median(cs2.NoSalary[[col_idx]], na.rm = TRUE)
  }
}
plot_missing(df)
sum(is.na(df))
## [1] 0
sum(is.na(cs2.NoAttrition))
## [1] 0
for (col_idx in seq_along(df_attrition2)) {
  if (is.numeric(df_attrition2[[col_idx]])) {
    df_attrition2[is.na(df_attrition2[[col_idx]]), col_idx] <-
      median(df_attrition2[[col_idx]], na.rm = TRUE)
  }
}
sum(is.na(df_attrition2))
## [1] 0
# Use of a GBM model to predict attrition ### GBM ###
library(gbm)
# The bernoulli loss needs a 0/1 outcome: recode "Yes" as 1, anything else as 0.
# Note this permanently changes df_attrition2$Attrition for the code below.
df_attrition2$Attrition <- as.numeric(df_attrition2$Attrition == "Yes")
gbm.mod_final <- gbm(
  formula = Attrition ~ .,
  data = df_attrition2,
  interaction.depth = 3,
  distribution = "bernoulli",
  n.trees = 5000,
  shrinkage = 0.1,
  n.minobsinnode = 10,
  cv.folds = 10
)
# Predicted probabilities on the rows the model was fitted on, cut at 0.5.
pred10 <- predict(gbm.mod_final, newdata = df_attrition2, type = "response")
pred10 <- as.numeric(pred10 > 0.5)
tab.gbm <- table(predicted = pred10, Actual = df_attrition2$Attrition)
(sensitivity <- round(tab.gbm[2, 2] * 100 / (tab.gbm[2, 2] + tab.gbm[1, 2])))
## [1] 51
(specificity <- round(tab.gbm[1, 1] * 100 / (tab.gbm[1, 1] + tab.gbm[2, 1])))
## [1] 99
# GBM reached a specificity of 99%; however, its sensitivity was low (51%).
# Use of KNN for prediction of attrition
# ------------------- KNN -------------------
library(class)
# 75/25 split of df_attrition2 (Attrition is 0/1-coded at this point).
idx <- sample.int(
  n = nrow(df_attrition2),
  size = floor(0.75 * nrow(df_attrition2)),
  replace = FALSE
)
train <- df_attrition2[idx, ]
test <- df_attrition2[-idx, ]
# Separate the outcome from the predictors by name. The previous hard-coded
# column index (-44) would silently drop the wrong column if the number of
# dummy columns ever changed.
predictor_cols <- setdiff(names(df_attrition2), "Attrition")
trn_target <- train$Attrition
trn <- train[, predictor_cols, drop = FALSE]
tst_target <- test$Attrition
tst <- test[, predictor_cols, drop = FALSE]
pred <- knn(train = trn, test = tst, cl = trn_target, k = 6)
# Confusion table: rows are the actual class, columns the predicted class.
model_table <- table(tst_target, pred)
model_table
## pred
## tst_target 0 1
## 0 179 5
## 1 29 5
# Hold-out accuracy for k = 6.
sum(diag(model_table)) / nrow(tst)
## [1] 0.8440367
## [1] 0.8440367
# Evaluate KNN for k = 1..50 on the hold-out rows and record accuracy (%),
# misclassification rate, sensitivity and specificity for each k.
# Result vectors are preallocated instead of grown from NULL.
Accuracy <- numeric(50)
mis <- numeric(50)
sen <- numeric(50)
spe <- numeric(50)
for (i in seq_len(50)) {
  pred <- knn(train = trn, test = tst, cl = trn_target, k = i)
  # Rows are predictions, columns are the actual classes.
  tab <- table(Predicted = pred, Real = tst_target)
  Accuracy[i] <- ((tab[1, 1] + tab[2, 2]) / sum(tab)) * 100
  mis[i] <- round((tab[1, 2] + tab[2, 1]) / sum(tab), 2)
  sen[i] <- round(tab[2, 2] / (tab[2, 2] + tab[1, 2]), 2)
  spe[i] <- round(tab[1, 1] / (tab[1, 1] + tab[2, 1]), 2)
}
# Loop-invariant, so computed once here rather than on every iteration.
# NOTE(review): this replaces the confusion table stored in model_table with
# the class counts of the training target -- confirm it is still wanted.
model_table <- table(trn_target)
plot(x = seq_len(50), y = Accuracy, xlab = "k", pch = 19, type = "b")
abline(v = which.max(Accuracy), col = "red", lwd = 2)
# Report the metrics at k = 6, the value used for the single fit above.
data.frame(
  Measure = c("Accuracy", "Misclassification Rate", "Sensitivity", "Specificity"),
  Value = c(round(Accuracy[6], 2), round(mis[6], 2), round(sen[6], 2), round(spe[6], 2))
)
## Measure Value
## 1 Accuracy 84.40
## 2 Misclassification Rate 0.16
## 3 Sensitivity 0.15
## 4 Specificity 0.97
# NOTE(review): cs2.NoAttrition_test is built from the selected competition
# features, which do not include Attrition, so this presumably yields NULL --
# confirm whether the assignment is still needed.
Attrition <- cs2.NoAttrition_test$Attrition
#———————————–
# Median-impute the numeric columns of cs2.NoSalary.
# seq_along() replaces 1:ncol(), which misbehaves on a zero-column frame.
for (col_idx in seq_along(cs2.NoSalary)) {
  if (is.numeric(cs2.NoSalary[[col_idx]])) {
    cs2.NoSalary[is.na(cs2.NoSalary[[col_idx]]), col_idx] <-
      median(cs2.NoSalary[[col_idx]], na.rm = TRUE)
  }
}
# Linear model for MonthlyIncome: 70/30 train/test split with a fixed seed.
set.seed(2021)
train_ind <- sample(
  seq_len(nrow(df_for_salary2)),
  size = floor(0.7 * nrow(df_for_salary2))
)
train <- df_for_salary2[train_ind, ]
test <- df_for_salary2[-train_ind, ]
# Regress MonthlyIncome on every other column of the training rows.
model2 <- lm(MonthlyIncome ~ ., data = train)
summary(model2)
##
## Call:
## lm(formula = MonthlyIncome ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2868.1 -647.5 -99.7 627.6 4304.6
##
## Coefficients: (9 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13341.88 638.10 20.909 < 2e-16 ***
## Age -268.64 276.89 -0.970 0.3324
## YearsSinceLastPromotion 436.68 267.49 1.632 0.1031
## YearsWithCurrManager 33.59 329.79 0.102 0.9189
## year5 -585.45 287.94 -2.033 0.0425 *
## YearsAtCompany 424.58 745.11 0.570 0.5690
## YearsInCurrentRole 27.52 20.53 1.340 0.1806
## TotalWorkingYears 1135.04 645.83 1.757 0.0794 .
## StockOptionLevel.0 -114.09 208.17 -0.548 0.5839
## StockOptionLevel.1 40.28 172.88 0.233 0.8159
## StockOptionLevel.2 -12.94 210.30 -0.062 0.9510
## StockOptionLevel.3 NA NA NA NA
## JobLevel.1 -10678.14 413.78 -25.806 < 2e-16 ***
## JobLevel.2 -9266.65 354.70 -26.125 < 2e-16 ***
## JobLevel.3 -6081.86 323.01 -18.828 < 2e-16 ***
## JobLevel.4 -2660.55 276.21 -9.632 < 2e-16 ***
## JobLevel.5 NA NA NA NA
## Education.1 178.76 270.95 0.660 0.5097
## Education.2 201.01 251.16 0.800 0.4239
## Education.3 171.50 243.11 0.705 0.4808
## Education.4 294.01 247.16 1.190 0.2347
## Education.5 NA NA NA NA
## MaritalStatusDivorced -304.53 181.77 -1.675 0.0944 .
## MaritalStatusMarried -37.56 145.48 -0.258 0.7963
## MaritalStatusSingle NA NA NA NA
## JobRole.Healthcare.Representative 729.54 472.25 1.545 0.1229
## JobRole.Human.Resources 178.55 610.32 0.293 0.7700
## JobRole.Laboratory.Technician -662.95 435.08 -1.524 0.1281
## JobRole.Manager 4632.79 425.54 10.887 < 2e-16 ***
## JobRole.Manufacturing.Director 913.72 469.55 1.946 0.0522 .
## JobRole.Research.Director 4446.30 505.61 8.794 < 2e-16 ***
## JobRole.Research.Scientist -462.98 435.64 -1.063 0.2883
## JobRole.Sales.Executive 1490.50 254.09 5.866 7.57e-09 ***
## JobRole.Sales.Representative NA NA NA NA
## GenderFemale -35.47 87.02 -0.408 0.6837
## GenderMale NA NA NA NA
## EducationFieldHuman.Resources -63.83 452.84 -0.141 0.8880
## EducationFieldLife.Sciences 167.75 157.06 1.068 0.2859
## EducationFieldMarketing 95.59 204.94 0.466 0.6411
## EducationFieldMedical -23.85 163.47 -0.146 0.8841
## EducationFieldOther 36.61 221.01 0.166 0.8685
## EducationFieldTechnical.Degree NA NA NA NA
## DepartmentHuman.Resources 278.58 572.14 0.487 0.6265
## DepartmentResearch...Development 693.29 390.12 1.777 0.0761 .
## DepartmentSales NA NA NA NA
## Attrition.No 16.61 122.87 0.135 0.8925
## Attrition.Yes NA NA NA NA
## NumCompaniesWorked -12.93 20.79 -0.622 0.5342
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1016 on 570 degrees of freedom
## Multiple R-squared: 0.9553, Adjusted R-squared: 0.9524
## F-statistic: 320.9 on 38 and 570 DF, p-value: < 2.2e-16
# Root-mean-squared error of the hold-out predictions.
pred1 <- predict(model2, newdata = test)
(RMSE.test <- sqrt(mean((pred1 - test$MonthlyIncome)^2)))
## [1] 1032.488
# Refit the same specification on all rows of df_for_salary2.
final.reg.model <- lm(MonthlyIncome ~ ., data = df_for_salary2)
summary(final.reg.model)
##
## Call:
## lm(formula = MonthlyIncome ~ ., data = df_for_salary2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3156.3 -650.8 -81.5 590.5 4268.7
##
## Coefficients: (9 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13586.419 540.253 25.148 < 2e-16 ***
## Age -137.647 227.992 -0.604 0.5462
## YearsSinceLastPromotion 176.497 219.869 0.803 0.4224
## YearsWithCurrManager -60.752 278.907 -0.218 0.8276
## year5 -585.410 239.165 -2.448 0.0146 *
## YearsAtCompany 538.091 630.979 0.853 0.3940
## YearsInCurrentRole 27.348 16.480 1.660 0.0974 .
## TotalWorkingYears 939.857 552.371 1.701 0.0892 .
## StockOptionLevel.0 -34.970 177.729 -0.197 0.8441
## StockOptionLevel.1 67.901 151.455 0.448 0.6540
## StockOptionLevel.2 -18.330 181.820 -0.101 0.9197
## StockOptionLevel.3 NA NA NA NA
## JobLevel.1 -11179.803 343.257 -32.570 < 2e-16 ***
## JobLevel.2 -9461.699 295.101 -32.063 < 2e-16 ***
## JobLevel.3 -6205.247 267.645 -23.185 < 2e-16 ***
## JobLevel.4 -2729.978 227.777 -11.985 < 2e-16 ***
## JobLevel.5 NA NA NA NA
## Education.1 500.210 233.325 2.144 0.0323 *
## Education.2 430.953 219.298 1.965 0.0497 *
## Education.3 379.028 213.130 1.778 0.0757 .
## Education.4 540.030 215.779 2.503 0.0125 *
## Education.5 NA NA NA NA
## MaritalStatusDivorced -107.825 149.447 -0.721 0.4708
## MaritalStatusMarried -23.053 117.309 -0.197 0.8443
## MaritalStatusSingle NA NA NA NA
## JobRole.Healthcare.Representative 712.045 383.363 1.857 0.0636 .
## JobRole.Human.Resources -128.198 501.019 -0.256 0.7981
## JobRole.Laboratory.Technician -488.433 353.682 -1.381 0.1677
## JobRole.Manager 4242.045 341.116 12.436 < 2e-16 ***
## JobRole.Manufacturing.Director 840.370 381.598 2.202 0.0279 *
## JobRole.Research.Director 4247.095 415.238 10.228 < 2e-16 ***
## JobRole.Research.Scientist -270.775 352.637 -0.768 0.4428
## JobRole.Sales.Executive 1184.899 205.099 5.777 1.07e-08 ***
## JobRole.Sales.Representative NA NA NA NA
## GenderFemale -85.110 71.102 -1.197 0.2316
## GenderMale NA NA NA NA
## EducationFieldHuman.Resources 9.144 366.428 0.025 0.9801
## EducationFieldLife.Sciences 84.397 129.844 0.650 0.5159
## EducationFieldMarketing 49.249 171.111 0.288 0.7736
## EducationFieldMedical 8.696 134.686 0.065 0.9485
## EducationFieldOther 16.237 186.473 0.087 0.9306
## EducationFieldTechnical.Degree NA NA NA NA
## DepartmentHuman.Resources 328.549 472.078 0.696 0.4866
## DepartmentResearch...Development 491.587 315.076 1.560 0.1191
## DepartmentSales NA NA NA NA
## Attrition.No 39.908 102.509 0.389 0.6971
## Attrition.Yes NA NA NA NA
## NumCompaniesWorked -7.074 17.501 -0.404 0.6862
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1008 on 831 degrees of freedom
## Multiple R-squared: 0.954, Adjusted R-squared: 0.9519
## F-statistic: 454 on 38 and 831 DF, p-value: < 2.2e-16
# In-sample RMSE of the final model (predictions on the rows it was fitted on).
pred1 <- predict(final.reg.model, newdata = df_for_salary2)
(RMSE.tr.reg <- sqrt(mean((pred1 - df_for_salary2$MonthlyIncome)^2)))
## [1] 985.0243
# Let's do some data preprocessing
# Make sure StockOptionLevel is a factor in the competition frame.
cs2.NoSalary$StockOptionLevel <- as.factor(cs2.NoSalary$StockOptionLevel)
# Add a MonthlyIncome column, filled with NaN, to the encoded competition set.
cs2.NoSalary_test$MonthlyIncome <- NaN
# Let's do some data preprocessing: rename the dummy-variable columns of cs2.NoSalary_test
# Rename dummy columns of cs2.NoSalary_test from the "Variable.Level" form to
# the "VariableLevel" form (e.g. "Gender.Male" -> "GenderMale").
# Each rename is listed once here; the earlier one-statement-per-rename code
# repeated most of them several times. Columns that are not present are simply
# left alone, exactly as before.
dummy_column_renames <- c(
  "MaritalStatus.Divorced" = "MaritalStatusDivorced",
  "MaritalStatus.Married" = "MaritalStatusMarried",
  "MaritalStatus.Single" = "MaritalStatusSingle",
  "Gender.Female" = "GenderFemale",
  "Gender.Male" = "GenderMale",
  "EducationField.Human.Resources" = "EducationFieldHuman.Resources",
  "EducationField.Life.Sciences" = "EducationFieldLife.Sciences",
  "EducationField.Marketing" = "EducationFieldMarketing",
  "EducationField.Medical" = "EducationFieldMedical",
  "EducationField.Other" = "EducationFieldOther",
  "EducationField.Technical.Degree" = "EducationFieldTechnical.Degree",
  "Department.Human.Resources" = "DepartmentHuman.Resources",
  "Department.Research...Development" = "DepartmentResearch...Development",
  "Department.Sales" = "DepartmentSales",
  "OverTime.No" = "OverTimeNo",
  "OverTime.Yes" = "OverTimeYes"
)
current_names <- names(cs2.NoSalary_test)
needs_rename <- current_names %in% names(dummy_column_renames)
names(cs2.NoSalary_test)[needs_rename] <-
  unname(dummy_column_renames[current_names[needs_rename]])
# Read the "No Salary" competition CSV; only its ID column is used here.
cs2.NoSalary1 <- read.csv(
  "/Users/owner/Desktop/homework/unit14,15(case sudy)/Unit%2014%20and%2015%20Case%20Study%202 2/CaseStudy02/CaseStudy2CompSet No Salary.csv",
  stringsAsFactors = TRUE
)

# Score the prepared test frame with final.reg.model.
pred2 <- predict(final.reg.model, newdata = cs2.NoSalary_test)

# Pair each ID from the freshly read file with its predicted MonthlyIncome.
# NOTE(review): assumes pred2 has one value per row of cs2.NoSalary1, in the
# same row order -- confirm against how cs2.NoSalary_test was built.
pred.df <- data.frame(
  ID = cs2.NoSalary1$ID,
  MonthlyIncome = pred2
)